in backend/app/extraction/MetadataEnrichment.scala [36:96]
/**
  * Wraps a value in `Some` without altering it.
  *
  * `enrich` passes this as the transform for fields whose raw string needs no parsing.
  *
  * @param v the value to wrap
  * @tparam T type of the wrapped value
  * @return `Some(v)` (always defined, even when `v` is `null`)
  */
def someIdentity[T](v: T): Option[T] =
  Some[T](v)
/**
  * Parses a string as a base-10 `Int`, yielding `None` instead of throwing when it is not one.
  *
  * @param c the raw text to parse
  * @return `Some(n)` when `c` is a valid `Int` literal, `None` otherwise
  */
def safeIntParse(c: String): Option[Int] =
  Try { Integer.parseInt(c) }.toOption
/**
  * Builds an `EnrichedMetadata` from raw extracted metadata.
  *
  * Each constructor argument is obtained by looking up one of the key lists defined in this file
  * via `extractFields` and converting the selected raw string with the given transform.
  *
  * @param metadata raw metadata, mapping each key to the values reported under it
  * @return the enriched metadata; an argument is `None` when `extractFields` yields nothing for it
  */
def enrich(metadata: Map[String, Seq[String]]): EnrichedMetadata = {
  // Bind `metadata` once so every argument below reads as "key list + parser".
  def field[T](candidateKeys: Seq[String])(parse: String => Option[T]): Option[T] =
    extractFields(metadata, candidateKeys)(parse)

  EnrichedMetadata(
    field(titleKeys)(someIdentity),
    field(authorKeys)(someIdentity),
    field(createdAtKeys)(isoDateToLong),
    field(lastModifiedKeys)(isoDateToLong),
    field(createdWithKeys)(someIdentity),
    field(pageCountKeys)(safeIntParse),
    field(wordCountKeys)(safeIntParse)
  )
}
// These lists of keys could probably be simplified now that we're on Tika v2,
// which removed duplicate/triplicate keys. The old keys have been left in for
// backwards compatibility, and because it's not clear how to test the change.
// https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Removedduplicate/triplicatekeys
/** Metadata keys that `enrich` consults for the document title. */
val titleKeys: List[String] =
  List("pdf:docinfo:title", "title", "dc:title")
/** Metadata keys that `enrich` consults for the document author. */
val authorKeys: List[String] =
  List("pdf:docinfo:author", "Author", "dc:creator", "creator")
/** Metadata keys that `enrich` consults for the creation date. */
val createdAtKeys: List[String] =
  List("meta:creation-date", "Creation-Date", "pdf:docinfo:created", "dcterms:created")
/** Metadata keys that `enrich` consults for the last-modified date. */
val lastModifiedKeys: List[String] =
  List("Last-Modified", "Last-Save-Date", "dcterms:modified")
/** Metadata keys that `enrich` consults for the tool the document was created with. */
val createdWithKeys: List[String] =
  List("pdf:docinfo:producer", "xmp:CreatorTool")
/** Metadata keys that `enrich` consults for the page count. */
val pageCountKeys: List[String] =
  List("xmpTPg:NPages", "meta:page-count", "Page-Count")
/** Metadata keys that `enrich` consults for the word count. */
val wordCountKeys: List[String] =
  List("meta:word-count", "Word-Count")
/**
  * Picks the most frequently reported value among the given metadata keys and transforms it.
  *
  * All values stored under any of `keys` are pooled and the value that occurs most often wins.
  * When several values are equally frequent, the one encountered first while walking `keys`
  * in order is chosen, so earlier keys take precedence and the result no longer depends on
  * hash-map iteration order.
  *
  * @param metadata  raw metadata, mapping each key to the values reported under it
  * @param keys      candidate keys, in order of precedence; duplicates are ignored
  * @param transform converts the winning raw string; only the winner is transformed
  * @tparam T type produced by `transform`
  * @return the transformed winning value, or `None` when no key has a value or `transform`
  *         rejects the winner
  */
private def extractFields[T](metadata: Map[String, Seq[String]], keys: Seq[String])(transform: String => Option[T]): Option[T] = {
  // Walk `keys` (not the metadata map) so candidates are collected in key-precedence order.
  val candidates: Seq[String] = for {
    key    <- keys.distinct
    values <- metadata.get(key).flatMap(Option(_)).toList // skip a missing or null value list
    value  <- values
  } yield value

  if (candidates.isEmpty) None
  else {
    val occurrences: Map[String, Int] =
      candidates.groupBy(identity).map { case (value, hits) => value -> hits.size }
    val highest = occurrences.values.max
    // `find` scans in precedence order, which makes the tie-break deterministic.
    candidates.find(candidate => occurrences(candidate) == highest).flatMap(transform)
  }
}