Skip to content

Commit

Permalink
Revert "feat: duplicate creation date metadata ICIJ/datashare#789"
Browse files Browse the repository at this point in the history
This reverts commit 4dc1634.
  • Loading branch information
mvanzalu committed Nov 21, 2022
1 parent 93db96c commit 1147362
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 65 deletions.
49 changes: 19 additions & 30 deletions extract-lib/src/main/java/org/icij/spewer/MetadataTransformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@

import java.io.IOException;
import java.io.Serializable;

import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.TemporalAccessor;

import java.util.*;
import java.util.stream.Stream;

Expand All @@ -28,14 +30,14 @@ public class MetadataTransformer implements Serializable {
// Deduplicate content types (Tika seems to add these sometimes, especially for RTF files).
Metadata.CONTENT_TYPE.toLowerCase(Locale.ENGLISH),

// Deduplicate titles (appear in bad HTML files).
TikaCoreProperties.TITLE.getName().toLowerCase(Locale.ENGLISH),
// Deduplicate titles (appear in bad HTML files).
TikaCoreProperties.TITLE.getName().toLowerCase(Locale.ENGLISH),

// Deduplicate these properties contained in some MSHTML documents.
TITLE,
"originator",
"generator",
"progid");
// Deduplicate these properties contained in some MSHTML documents.
TITLE,
"originator",
"generator",
"progid");
}

private static final long serialVersionUID = -6643888792096975746L;
Expand Down Expand Up @@ -104,14 +106,6 @@ public void transform(final ValueConsumer single, final ValueArrayConsumer multi
transform(entry.getKey(), values[0], single);
}
}
String dctermsDate = metadata.get(DublinCore.CREATED);
if( dctermsDate != null ) {
single.accept("tika_metadata_creation_date", dctermsDate);
Instant dctermsDateInstant = getInstant(dctermsDate, null);
if ( dctermsDateInstant != null) {
single.accept("tika_metadata_creation_date_iso8601", dctermsDateInstant.toString());
}
}
} catch (IOException e) {
throw new TaggedIOException(e, getClass());
}
Expand Down Expand Up @@ -167,29 +161,18 @@ private void transform(final String normalisedName, final String value, final Va

private void transformDate(final String name, final ValueConsumer consumer) throws IOException {
final Date date = metadata.getDate(dateProperties.get(name));

String strDate = metadata.get(name);
Instant instant = getInstant(strDate, date);

if (null != instant) {
consumer.accept(fields.forMetadataISODate(name), instant.toString());
} else {
throw new IOException(String.format("Unable to parse date \"%s\" from field "
"\"%s\" for ISO 8601 formatting.", strDate, name));
}
}

private Instant getInstant(String strDate, Date date) {
Instant instant = null;

if (null != date) {
instant = date.toInstant();
} else {

// Try some other formats.
for (DateTimeFormatter format: dateFormats) {
final TemporalAccessor accessor;

try {
accessor = format.parseBest(strDate, Instant::from, LocalDateTime::from);
accessor = format.parseBest(metadata.get(name), Instant::from, LocalDateTime::from);
} catch (final DateTimeParseException e) {
continue;
}
Expand All @@ -205,7 +188,13 @@ private Instant getInstant(String strDate, Date date) {
break;
}
}
return instant;

if (null != instant) {
consumer.accept(fields.forMetadataISODate(name), instant.toString());
} else {
throw new IOException(String.format("Unable to parse date \"%s\" from field "
"\"%s\" for ISO 8601 formatting.", metadata.get(name), name));
}
}

@FunctionalInterface
Expand Down

This file was deleted.

0 comments on commit 1147362

Please sign in to comment.