backend/app/services/index/HighlightFields.scala (106 lines of code) (raw):

package services.index

import com.sksamuel.elastic4s.requests.searches.queries.QueryStringQuery
import com.sksamuel.elastic4s.requests.searches.{HighlightField, InnerHit, SearchHit}
import com.sksamuel.elastic4s.ElasticDsl._
import model.Languages
import model.frontend.Highlight

/**
  * Builds the highlight section of a search request and converts the highlight
  * fragments of a search hit into [[Highlight]] values.
  */
object HighlightFields {
  import IndexFields._

  /** Display names for known fields, keyed by field-name prefix (matched with `startsWith`). */
  private val defaultMappings: Map[String, String] = Map(
    text -> "Body Text",
    ocr -> "OCR Text",
    transcript -> "Transcript Text",
    s"$metadataField.${metadata.subject}" -> "Email Subject",
    s"$metadataField.${metadata.fromField}.${metadata.recipients.name}" -> "Email From",
    s"$metadataField.${metadata.fromField}.${metadata.recipients.address}" -> "Email From",
    s"$metadataField.${metadata.recipientsField}.${metadata.recipients.name}" -> "Email Recipient",
    s"$metadataField.${metadata.recipientsField}.${metadata.recipients.address}" -> "Email Recipient",
    s"$metadataField.${metadata.fileUris}" -> "File Path",
    s"$metadataField.${metadata.html}" -> "Email HTML",
    s"$metadataField.${metadata.mimeTypes}" -> "Mime Type"
  )

  /** Upper bound on the number of highlights returned by [[parseHit]]. */
  private val MaxResults = 5

  private val metadataFieldPrefix = s"$metadataField."
  private val enrichedMetadataFieldPrefix = s"$metadataField.${metadata.enrichedMetadataField}."

  /** Prefixes removed from a field name when it has no entry in `defaultMappings`. */
  private val fieldPrefixes = List(enrichedMetadataFieldPrefix, metadataFieldPrefix)

  /**
    * Highlighters for every top-level field that should be highlighted.
    *
    * Keep this list in sync with the fields searched in ElasticsearchResources.buildQuery.
    * Raw metadata highlighting is configured there as well, because it is a nested field.
    *
    * Fields are listed explicitly instead of highlighting every top-level field, since
    * some of them are not human readable (for example the workspace ID).
    *
    * @param topLevelSearchQuery the query handed to each per-language highlighter
    * @return the highlighters, in the order they are listed below
    */
  def searchHighlights(topLevelSearchQuery: QueryStringQuery): List[HighlightField] = {
    val fromPath = s"$metadataField.${metadata.fromField}"
    val recipientsPath = s"$metadataField.${metadata.recipientsField}"

    List.concat(
      textHighlighters(topLevelSearchQuery),
      languageHighlighters(s"$metadataField.${metadata.fileUris}", topLevelSearchQuery),
      // use the simpler highlighter as the field is small, so there's no need for positions_with_offsets
      highlighter(s"$metadataField.${metadata.mimeTypes}").highlighterType("unified") :: Nil,
      languageHighlighters(s"$fromPath.${metadata.from.name}", topLevelSearchQuery),
      highlighter(s"$fromPath.${metadata.from.address}") :: Nil,
      languageHighlighters(s"$recipientsPath.${metadata.from.name}", topLevelSearchQuery),
      highlighter(s"$recipientsPath.${metadata.from.address}") :: Nil,
      languageHighlighters(s"$metadataField.${metadata.subject}", topLevelSearchQuery),
      languageHighlighters(s"$metadataField.${metadata.html}", topLevelSearchQuery)
    )
  }

  /**
    * Per-language highlighters for the body text, OCR and transcript fields, in that order.
    *
    * @param topLevelSearchQuery the query handed to each per-language highlighter
    */
  def textHighlighters(topLevelSearchQuery: QueryStringQuery): List[HighlightField] =
    List(text, ocr, transcript).flatMap { contentField =>
      languageHighlighters(contentField, topLevelSearchQuery)
    }

  /**
    * Collects the highlights of a search hit, from both its top-level highlight map and
    * its inner hits.
    *
    * @param hit the search hit to read highlights from
    * @return at most `MaxResults` highlights
    */
  def parseHit(hit: SearchHit): Seq[Highlight] = {
    // the top-level highlight map is wrapped in Option so that a null value becomes an empty map
    val fromTopLevel = Option(hit.highlight).getOrElse(Map.empty)
    val nestedHits = hit.innerHits.values.flatMap(_.hits)
    val fromInnerHits = getHighlightFields(nestedHits)

    // on a key clash the inner-hit entry replaces the top-level one
    val fragmentsByField = deduplicateHighlightsByLanguage(fromTopLevel ++ fromInnerHits)
    val allHighlights = fragmentsByField.flatMap { case (fieldName, fragments) =>
      getHighlights(fieldName, fragments)
    }

    allHighlights.toSeq.take(MaxResults)
  }

  /**
    * Base highlighter for a field: "fvh" highlighter type, fragments ordered by score and
    * matches wrapped in `<result-highlight>` tags.
    *
    * @param fieldName the field to highlight
    */
  def highlighter(fieldName: String): HighlightField =
    highlight(fieldName)
      .highlighterType("fvh")
      .order("score")
      .preTag("<result-highlight>")
      .postTag("</result-highlight>")

  /**
    * Highlighter for one language-specific field, matching on the field itself and on its
    * `.exact` sub-field.
    *
    * @param fieldName           the language-specific field to highlight
    * @param topLevelSearchQuery the search query, reused as the highlight query with
    *                            `fieldName` added to it
    */
  def singleLanguageHighlighter(fieldName: String, topLevelSearchQuery: QueryStringQuery): HighlightField = {
    val exactSubField = s"$fieldName.exact"
    // Exact matching doesn't seem to work for highlights when the query also contains a nested
    // query (with inner hits), as ours does for metadata. Specifying the highlight query
    // explicitly appears to fix this.
    val highlightQuery = topLevelSearchQuery.field(fieldName)

    highlighter(fieldName)
      .matchedFields(fieldName, exactSubField)
      .query(highlightQuery)
  }

  /**
    * One highlighter per entry of `Languages.all`, targeting `fieldName.<language key>`.
    *
    * @param fieldName           the field whose language-specific sub-fields are highlighted
    * @param topLevelSearchQuery the query handed to each highlighter
    */
  def languageHighlighters(fieldName: String, topLevelSearchQuery: QueryStringQuery): List[HighlightField] =
    for (language <- Languages.all)
      yield singleLanguageHighlighter(s"$fieldName.${language.key}", topLevelSearchQuery)

  /**
    * Re-keys inner-hit highlights by the `key` entry of each inner hit's source.
    *
    * Inner hits whose source has no string `key` are skipped. All highlight entries of an
    * inner hit are stored under that one key, so a later entry replaces an earlier one, and
    * a later inner hit with the same key replaces what an earlier one stored.
    * NOTE(review): confirm that dropping the earlier fragments is intended.
    */
  private def getHighlightFields(innerHits: Iterable[InnerHit]): Map[String, Seq[String]] =
    innerHits.foldLeft(Map.empty[String, Seq[String]]) { (collected, innerHit) =>
      innerHit.source.get("key") match {
        case Some(key: String) =>
          innerHit.highlight.values.foldLeft(collected) { (soFar, fragments) =>
            soFar.updated(key, fragments)
          }
        case _ =>
          collected
      }
    }

  /** Removes a trailing `.<language key>` from `key` for the first matching language, if any. */
  private def stripLanguageSuffix(key: String): String =
    Languages.all
      .map(language => s".${language.key}")
      .find(key.endsWith)
      .fold(key)(suffix => key.dropRight(suffix.length))

  /**
    * Collapses the per-language variants of a field into a single entry.
    *
    * Some values are indexed with language-specific analysers, using a separate object field
    * per language. OCR values genuinely differ per language because they come from separately
    * trained ML models, so keys starting with the OCR field are kept as they are; keys starting
    * with the transcript field are kept as they are too. For every other key the language
    * suffix is dropped and only one of the variants survives, to avoid confusing users.
    * This only affects documents processed as multiple languages; the vast majority are
    * processed using a single language.
    */
  private def deduplicateHighlightsByLanguage(highlights: Map[String, Seq[String]]): Map[String, Seq[String]] = {
    def keepsLanguageVariants(key: String): Boolean =
      List(ocr, transcript).exists(key.startsWith(_))

    highlights.foldLeft(Map.empty[String, Seq[String]]) { case (merged, (key, fragments)) =>
      val targetKey = if (keepsLanguageVariants(key)) key else stripLanguageSuffix(key)
      merged.updated(targetKey, fragments)
    }
  }

  /**
    * Turns the fragments of one field into [[Highlight]] values.
    *
    * The display name is the `defaultMappings` label of the first key that prefixes `field`.
    * Without such a key it is `field` with its longest matching entry of `fieldPrefixes`
    * removed, or `field` unchanged when no prefix matches.
    */
  private def getHighlights(field: String, highlight: Seq[String]): List[Highlight] = {
    val mappedName = defaultMappings.collectFirst {
      case (prefix, label) if field.startsWith(prefix) => label
    }

    val displayName = mappedName.getOrElse {
      // longest prefix first, so the result does not depend on the order of fieldPrefixes
      fieldPrefixes
        .filter(field.startsWith(_))
        .sortBy(-_.length)
        .headOption
        .fold(field)(longestPrefix => field.stripPrefix(longestPrefix))
    }

    highlight.toList.map(fragment => Highlight(field, displayName, fragment))
  }
}