backend/app/services/index/HitReaders.scala (271 lines of code) (raw):
package services.index
import com.sksamuel.elastic4s.requests.searches.SearchHit
import com.sksamuel.elastic4s.{Hit, HitReader}
import enumeratum.{EnumEntry, PlayEnum}
import extraction.EnrichedMetadata
import model.frontend.{DocumentResultDetails, EmailResultDetails, SearchResult}
import model.index._
import model.{Email, English, Language, Languages, Recipient, Sensitivity, Uri}
import services.events.{Event, EventFields, EventType}
import scala.util.control.NonFatal
import scala.util.{Failure, Success, Try}
object HitReaders {
import IndexFields._
import scala.language.implicitConversions
type FieldMap = Map[String, Any]
// TODO MRB: are these going to cause too many allocations?
implicit class RichFieldMap(fields: FieldMap) {
def field[T](name: String): T = fields(name).asInstanceOf[T]
def longField(name: String): Long = fields(name) match {
case i: java.lang.Long => i.toLong
case i: java.lang.Integer => i.toLong
}
def doubleField(name: String): Double = fields(name) match {
case f: java.lang.Double => f
}
def optField[T](name: String): Option[T] = fields.get(name).flatMap(Option(_)).map(_.asInstanceOf[T])
def optLongField(name: String): Option[Long] = if(fields.contains(name)) { Some(longField(name)) } else { None }
def listField[T](name: String): List[T] = optField[List[T]](name).getOrElse(Nil)
def setField[T](name: String): Set[T] = optField[List[T]](name).map(_.toSet).getOrElse(Set.empty)
def optEnumField[T <: EnumEntry](name: String, playEnum: PlayEnum[T]): Option[T] = optField(name).flatMap(playEnum.withNameOption)
def objectField(name: String): FieldMap = field[FieldMap](name)
def nestedField(name: String): Map[String, Seq[String]] = {
val base = Map.empty[String, Seq[String]]
optField[List[FieldMap]](name).getOrElse(List.empty).foldLeft(base) { (acc, entry) =>
val key = entry.field[String]("key")
val values = entry.get("values") match {
case Some(s: String) => List(s)
case _ => entry.field[List[String]]("values")
}
acc.get(key) match {
case Some(existing) =>
acc + (key -> (existing ++ values))
case None =>
acc + (key -> values)
}
}
}
// Documents are mostly a single language but it is possible to index them against multiple languages:
//
// text: {
// english: <contents>
// }
//
// In this case the OCR field will contain completely different content (eg from a different Tesseract model) but the
// text fields will contain the same values (eg extracted text from the PDF). So we just pick the first one we find.
def multiLanguageField[T](name: String): T = {
fields.field[FieldMap](name).values.head.asInstanceOf[T]
}
def optMultiLanguageField[T](name: String): Option[T] = {
fields.optField[FieldMap](name).flatMap(_.values.headOption).map(_.asInstanceOf[T])
}
}
implicit def HitToRichFieldMap(hit: Hit): RichFieldMap = {
new RichFieldMap(hit.sourceAsMap)
}
implicit object SearchResultHitReader extends HitReader[SearchResult] {
override def read(hit: Hit): Try[SearchResult] = {
val metadata = hit.field[FieldMap]("metadata")
val flag = hit.optField[String](flags)
val created = hit.optField[Long](createdAt)
val details = hit.field[String](`type`) match {
case "email" => readEmailResult(metadata)
case _ => readDocumentResult(metadata)
}
val highlights = hit match {
case searchHit: SearchHit => HighlightFields.parseHit(searchHit)
case _ => Seq.empty
}
val fieldWithMostHighlights = Try(
highlights
.groupBy(_.field)
.maxBy(_._2.length)
._1
).toOption
Success(SearchResult(hit.id, highlights, fieldWithMostHighlights, flag, created, details))
}
}
implicit object IndexedResourceHitReader extends HitReader[IndexedResource] {
override def read(hit: Hit): Try[IndexedResource] = {
hit.field[String](`type`) match {
case "blob" =>
val resource = readDocument(hit.id, hit.sourceAsMap)
val highlights = getHighlights(hit)
Success(
resource.copy(
text = highlights.flatMap(highlightedText(_, text)).getOrElse(resource.text),
ocr = highlightedOcr(highlights).orElse(resource.ocr),
transcript = highlightedTranscript(highlights, resource.transcript).orElse(resource.transcript)
)
)
case "email" =>
val resource = readEmail(hit.id, hit.sourceAsMap)
Success(resource.copy(body = getHighlights(hit).flatMap(highlightedText(_, text)).getOrElse(resource.body)))
case tpe =>
Failure(new IllegalStateException(s"Resource exists in index but has an invalid type $tpe"))
}
}
}
implicit object EventHitReader extends HitReader[Event] {
override def read(hit: Hit): Try[Event] = try {
val rawEventType = hit.field[String](EventFields.eventType)
val eventType = EventType.fromString(rawEventType)
val timestamp = hit.longField(EventFields.timestamp)
val description = hit.field[String](EventFields.description)
val tags = hit.nestedField(EventFields.tagsField).map { case(k, v) => k -> v.head }
Success(Event(eventType, timestamp, description, tags))
} catch {
case NonFatal(e) => Failure(e)
}
}
implicit object IndexedBlobHitReader extends HitReader[IndexedBlob] {
override def read(hit: Hit): Try[IndexedBlob] = try {
val ingestion = hit.setField[String](IndexFields.ingestion)
val collection = hit.setField[String](IndexFields.collection)
Success(IndexedBlob(uri = hit.id, collections = collection, ingestions = ingestion))
} catch {
case NonFatal(e) => Failure(e)
}
}
implicit object PageHitReader extends HitReader[Page] {
override def read(hit: Hit): Try[Page] = try {
val page = hit.longField(PagesFields.page)
val highlightValues = highlightedPageOcr(getHighlights(hit))
val notHighlightValues = hit.field[FieldMap](PagesFields.value).map { case(langKey, rawValue) =>
Languages.getByKeyOrThrow(langKey) -> rawValue.asInstanceOf[String]
}
val dimensions = readDimensions(hit)
// highlight values take precedent
Success(Page(page, notHighlightValues ++ highlightValues, dimensions))
} catch {
case NonFatal(e) => Failure(e)
}
def readDimensions(hit: Hit): PageDimensions = {
val width = hit.doubleField(s"${PagesFields.dimensions}.${PagesFields.width}")
val height = hit.doubleField(s"${PagesFields.dimensions}.${PagesFields.height}")
val top = hit.doubleField(s"${PagesFields.dimensions}.${PagesFields.top}")
val bottom = hit.doubleField(s"${PagesFields.dimensions}.${PagesFields.bottom}")
PageDimensions(width, height, top, bottom)
}
}
private def readFileUris(fields: FieldMap): List[String] = {
// Each item in the array is a multi-language value (eg { english: "test", portuguese: "test" })
fields.listField[FieldMap](metadata.fileUris).map(_.values.head.asInstanceOf[String])
}
private def readEmailResult(fields: FieldMap): EmailResultDetails = {
val from = fields.optField(metadata.fromField).map(readRecipient).getOrElse(Recipient.unknown)
val subject = fields.optMultiLanguageField[String](metadata.subject).getOrElse("<Unknown Subject>")
val attachmentCount = fields.optField[Int](metadata.attachmentCount).getOrElse(0)
EmailResultDetails(from, subject, attachmentCount)
}
private def readDocumentResult(fields: FieldMap): DocumentResultDetails = {
val paths = readFileUris(fields)
val mimeTypes = fields.listField[String](metadata.mimeTypes)
val fileSize = fields.optLongField(metadata.fileSize)
DocumentResultDetails(mimeTypes, paths, fileSize)
}
private def readEmail(id: String, fields: FieldMap): Email = {
val metadataMap = fields.field[FieldMap](metadataField)
Email(
uri = Uri(id),
body = fields.multiLanguageField[String](text),
from = metadataMap.optField[FieldMap](metadata.fromField).map(readRecipient),
recipients = metadataMap.optField[List[FieldMap]](metadata.recipientsField).getOrElse(Nil).map(readRecipient),
sentAt = metadataMap.optField[String](metadata.sentAt),
sensitivity = metadataMap.optEnumField(metadata.sensitivity, Sensitivity),
priority = metadataMap.optField[String](metadata.priority),
subject = metadataMap.optMultiLanguageField(metadata.subject).getOrElse("<Unknown Subject>"),
inReplyTo = metadataMap.listField[String](metadata.inReplyTo),
references = metadataMap.listField[String](metadata.references),
html = metadataMap.optMultiLanguageField[String](metadata.html),
attachmentCount = metadataMap.optField[Int](metadata.attachmentCount).getOrElse(0),
metadata = readMetadata(fields),
flag = fields.optField[String](flags)
)
}
private def readDocument(id: String, fields: FieldMap): Document = {
val metadataMap = fields.field[FieldMap](metadataField)
Document(
uri = Uri(id),
text = fields.optMultiLanguageField(text).getOrElse(""),
ocr = readOcr(fields),
transcript = readTranscript(fields),
enrichedMetadata = readEnrichedMetadata(fields),
flag = fields.optField[String](flags),
extracted = fields.optField[Boolean](extracted).getOrElse(false),
mimeTypes = metadataMap.setField[String](metadata.mimeTypes),
fileUris = readFileUris(metadataMap).toSet,
fileSize = metadataMap.optLongField(metadata.fileSize).getOrElse(0L),
metadata = readMetadata(fields)
)
}
private def readRecipient(fields: FieldMap): Recipient = Recipient(
fields.optMultiLanguageField(metadata.recipients.name),
fields.field(metadata.recipients.address)
)
private def readMetadata(fields: FieldMap): Map[String, Seq[String]] = {
fields.optField[FieldMap](metadataField).map(_.nestedField(metadata.extractedMetadataField)).getOrElse(Map.empty)
}
private def readOcr(fields: FieldMap): Option[Map[String, String]] = {
fields.optField[FieldMap](ocr).map { languages =>
languages.view.mapValues(_.asInstanceOf[String]).toMap
}
}
private def readTranscript(fields: FieldMap): Option[Map[String, String]] = {
fields.optField[FieldMap](transcript).map { languages =>
languages.view.mapValues(_.asInstanceOf[String]).toMap
}
}
private def readEnrichedMetadata(fields: FieldMap): Option[EnrichedMetadata] = {
val maybeMetadata = fields.optField[FieldMap](metadataField)
.flatMap(_.optField[FieldMap](metadata.enrichedMetadataField))
maybeMetadata.map { map =>
EnrichedMetadata(
map.optField[String](metadata.enrichedMetadata.title),
map.optField[String](metadata.enrichedMetadata.author),
map.optField[Long](metadata.enrichedMetadata.createdAt),
map.optField[Long](metadata.enrichedMetadata.lastModified),
map.optField[String](metadata.enrichedMetadata.createdWith),
map.optField[Int](metadata.enrichedMetadata.pageCount),
map.optField[Int](metadata.enrichedMetadata.wordCount)
)
}
}
private def getHighlights(hit: Hit): Option[Map[String, Seq[String]]] = hit match {
case searchHit: SearchHit => Some(Option(searchHit.highlight).getOrElse(Map.empty))
case _ => None
}
private def highlightedText(highlights: Map[String, Seq[String]], fieldName: String): Option[String] = {
highlights.collectFirst {
case (key, value :: _) if key.startsWith(fieldName) && value != "" => value
}
}
private def highlightedOcr(maybeHighlights: Option[Map[String, Seq[String]]]): Option[Map[String, String]] = {
maybeHighlights match {
case Some(highlights) if highlights.nonEmpty =>
val prefix = IndexFields.ocr + "."
Some(highlights.collect {
case (key, values) if key.startsWith(IndexFields.ocr) && values.nonEmpty =>
key.substring(prefix.length) -> values.head
})
case _ =>
None
}
}
private def highlightedTranscript(maybeHighlights: Option[Map[String, Seq[String]]], maybeTranscript: Option[Map[String, String]]): Option[Map[String, String]] = {
maybeHighlights match {
case Some(highlights) if highlights.nonEmpty =>
val prefix = IndexFields.transcript + "."
val highlightedLanguages = highlights.collect {
case (key, values) if key.startsWith(IndexFields.transcript) && values.nonEmpty =>
key.substring(prefix.length) -> values.head
}
// don't discard languages without matches
val nonHighlightedLanguages = maybeTranscript.map { transcript =>
transcript.collect {
case (key, value) if !highlightedLanguages.contains(key) =>
key -> value
}
}
Some(highlightedLanguages ++ nonHighlightedLanguages.getOrElse(Map()))
case _ =>
None
}
}
private def highlightedPageOcr(maybeHighlights: Option[Map[String, Seq[String]]]): Map[Language, String] = {
val highlights = maybeHighlights.getOrElse(Map.empty)
// We only expect a single highlight in each entry as we ask ES to highlight the whole document in the query
// ES will return "value.english" as the field name
highlights.collect { case(fieldName, highlight :: _) if fieldName.startsWith(PagesFields.value) =>
val langKey = fieldName.substring(PagesFields.value.length + 1)
val lang = Languages.getByKeyOrThrow(langKey)
lang -> highlight
}
}
}