backend/app/services/index/Pages2.scala (115 lines of code) (raw):

package services.index

import com.sksamuel.elastic4s.ElasticClient
import com.sksamuel.elastic4s.ElasticDsl._
import services.index.HitReaders.{HitToRichFieldMap, PageHitReader, RichFieldMap}
import com.sksamuel.elastic4s.requests.searches.queries.Query
import com.sksamuel.elastic4s.requests.searches.{HighlightField, MultisearchResponseItem, SearchRequest}
import model.Uri
import model.frontend.Highlight
import model.index.{Page, PageWithFind}
import services.ElasticsearchSyntax
import utils.Logging
import utils.attempt.{Attempt, ElasticSearchQueryFailure, MultipleFailures, NotFoundFailure}

import scala.concurrent.ExecutionContext

/**
  * Queries the per-page text index (`{indexNamePrefix}-text`) in Elasticsearch: page counts,
  * single-page lookups with highlights, and find-in-document page matching.
  *
  * @param client          the Elasticsearch client used to run requests
  * @param indexNamePrefix prefix from which the text index name is derived
  * @param ex              execution context used when composing the resulting [[Attempt]]s
  */
class Pages2(val client: ElasticClient, indexNamePrefix: String)(implicit val ex: ExecutionContext) extends ElasticsearchSyntax with Logging {
  val textIndexName = s"$indexNamePrefix-text"

  /**
    * Checks whether page 1 of the document exists under the `{documentHash}-{pageNumber}` id format.
    *
    * Only documents whose id is of that format are counted, to avoid telling the frontend to try
    * and render documents that were uploaded before the id format changed in
    * https://github.com/guardian/pfi/pull/884 and https://github.com/guardian/pfi/pull/886
    */
  private def firstPageExistsInNewIdFormat(uri: Uri): Attempt[Boolean] = {
    execute {
      count(textIndexName).query(
        termQuery("_id", s"${uri.value}-1")
      )
    }.map { resp =>
      resp.count > 0
    }
  }

  /** Counts the entries in the text index whose resource id field equals `uri.value`. */
  private def pageCount(uri: Uri): Attempt[Long] = {
    execute {
      count(textIndexName).query(
        termQuery(PagesFields.resourceId, uri.value)
      )
    }.map { resp =>
      resp.count
    }
  }

  /**
    * Number of pages indexed for the given document.
    *
    * @param uri the document whose pages are counted
    * @return 0 when the first page is not stored under the new id format (see
    *         [[firstPageExistsInNewIdFormat]]), otherwise the count of pages for the resource
    */
  def getPageCount(uri: Uri): Attempt[Long] =
    firstPageExistsInNewIdFormat(uri).flatMap { hasPages =>
      // Only issue the second count query when its result can actually be used.
      if (hasPages) pageCount(uri) else Attempt.Right(0L)
    }

  /**
    * Get geometries for a given page (page geometry and highlights).
    *
    * Issues one multi-search: the first query always fetches the page with the search highlights;
    * a second query for the same page, with find highlights, is added only when `findQuery` is defined.
    *
    * @param uri         the document the page belongs to
    * @param pageNumber  page number, combined with `uri.value` to form the index id
    * @param searchQuery optional query used to build the search highlight fields
    * @param findQuery   optional query used to build the find highlight fields
    * @return the page with its find highlights when present; a `MultipleFailures` if any
    *         sub-query failed; a `NotFoundFailure` if no page was returned
    */
  def getPageGeometries(uri: Uri, pageNumber: Int, searchQuery: Option[String], findQuery: Option[String]): Attempt[PageWithFind] = {
    val searchHighlightFields = buildHighlightFields(searchQuery)
    val findHighlightFields = buildHighlightFields(findQuery)

    val indexId = s"${uri.value}-$pageNumber"

    val queries = List(
      Some(
        search(textIndexName)
          .termQuery("_id", indexId)
          .highlighting(searchHighlightFields)
      ),
      findQuery.map { _ =>
        search(textIndexName)
          .termQuery("_id", indexId)
          .highlighting(findHighlightFields)
      }
    ).flatten

    execute {
      multi(
        queries
      )
    }.flatMap { response =>
      val results = response.items.collect {
        case MultisearchResponseItem(_, _, Right(result)) => result
      }

      val errors = response.items.collect {
        case MultisearchResponseItem(_, status, Left(err)) =>
          ElasticSearchQueryFailure(new IllegalStateException(err.toString), status, None)
      }

      if (errors.nonEmpty) {
        Attempt.Left(MultipleFailures(errors.toList))
      } else {
        val pages = results.flatMap(_.to[Page])

        pages.headOption match {
          case None =>
            Attempt.Left(NotFoundFailure(s"No page found in elasticsearch with id ${indexId}"))

          case Some(searchPage) =>
            // The second element of the results, when present, is the same page
            // fetched with find highlights (only requested when findQuery is defined).
            val pageWithFindHighlights = pages.lift(1)

            Attempt.Right(
              PageWithFind(searchPage.page, searchPage.value, pageWithFindHighlights.map(_.value), searchPage.dimensions)
            )
        }
      }
    }
  }

  /**
    * Searches within the page index of a single document and returns the page numbers that match.
    * It can be reused for find search and for regular highlighting.
    *
    * @param uri       the document to search within
    * @param findQuery the query string to match against page text
    * @return the distinct, ascending page numbers of the returned hits
    */
  def findInPages(uri: Uri, findQuery: String): Attempt[Seq[Int]] = {
    val query = buildQuery(findQuery)
    val documentFilter = termQuery(PagesFields.resourceId, uri.value)

    execute {
      search(textIndexName)
        // NOTE(review): at most 501 hits are requested; the reason for this exact limit is not
        // visible here (presumably a display cap plus one) - confirm against the caller.
        .size(501)
        .query(
          must(query).filter(documentFilter)
        )
    }.map { response =>
      // TODO should really be a map of language -> page matches
      val matchingPages: Seq[Int] =
        response.hits.hits.map(_.field[Int](PagesFields.page)).distinct.sorted.toIndexedSeq

      matchingPages
    }
  }

  /**
    * Builds a query-string query over every sub-field of the page text field, requiring all terms
    * to match (`and` default operator) and using the `.exact` field suffix for quoted phrases.
    */
  private def buildQuery(query: String) =
    queryStringQuery(query)
      .defaultOperator("and")
      .field(s"${PagesFields.value}.*")
      .quoteFieldSuffix(".exact")

  /** Builds the highlight fields for an optional query; yields no fields when the query is absent. */
  private def buildHighlightFields(query: Option[String]) =
    query.map(buildQuery).toList.flatMap { builtQuery =>
      HighlightFields.languageHighlighters(PagesFields.value, builtQuery)
        // Ensure we get the whole page, not just the highlights
        .map(_.numberOfFragments(0))
    }
}