backend/app/commands/GetPages.scala (67 lines of code) (raw):
package commands
import model.frontend.HighlightableText
import model.{Language, Uri}
import model.index.{FrontendPage, FrontendPageResult, Page, PageHighlight, PageResult}
import org.apache.pdfbox.pdmodel.PDDocument
import services.ObjectStorage
import services.index.Pages
import services.previewing.PreviewService
import utils.PDFUtil
import utils.attempt.{Attempt, IllegalStateFailure}
import java.io.InputStream
import scala.concurrent.ExecutionContext
/**
  * Command that fetches the text pages of a document and, for each page, resolves the search
  * highlights against that page's stored preview PDF.
  *
  * @param uri                   the document whose pages are requested
  * @param top                   forwarded unchanged to `pagesService.getTextPages`
  *                              (presumably the upper bound of the requested region - confirm against `Pages`)
  * @param bottom                forwarded unchanged to `pagesService.getTextPages`
  *                              (presumably the lower bound of the requested region - confirm against `Pages`)
  * @param query                 optional search query forwarded to `pagesService.getTextPages`
  * @param userRequestedLanguage language the user asked to view, if any; used when choosing which
  *                              per-language preview to load for each page
  * @param pagesService          source of the indexed page text
  * @param previewStorage        object storage holding the per-page, per-language preview PDFs
  */
class GetPages(uri: Uri, top: Double, bottom: Double, query: Option[String], userRequestedLanguage: Option[Language],
               pagesService: Pages, previewStorage: ObjectStorage)(implicit ec: ExecutionContext) extends AttemptCommand[FrontendPageResult] {

  /** Loads the text pages and converts them into the frontend representation with highlights attached. */
  override def process(): Attempt[FrontendPageResult] =
    pagesService
      .getTextPages(uri, top, bottom, query)
      .flatMap(pages => addSearchHighlightsToResponse(pages, uri, userRequestedLanguage))

  /**
    * Builds a [[FrontendPage]] for every page in `result`.
    *
    * For each page this chooses a language, fetches the matching preview PDF from `previewStorage`
    * and, when any language on the page has search highlights, computes the highlights from that PDF.
    * The whole result fails if any single page fails.
    */
  private def addSearchHighlightsToResponse(result: PageResult, uri: Uri, userRequestedLanguage: Option[Language]): Attempt[FrontendPageResult] = {
    val frontendPages: List[Attempt[FrontendPage]] = result.pages.map { page =>
      val pageNumber = page.page.toInt
      val allLanguages = page.value.keySet

      for {
        metadata <- GetPages.getPagePreviewMetadata(uri, page, userRequestedLanguage)
        previewUri = PreviewService.getPageStoragePath(uri, metadata.language, pageNumber)
        // NOTE(review): the preview is fetched even when there are no highlights, only to be closed
        // again below. Kept as-is because skipping the fetch would also skip the failure that a
        // missing preview currently produces - confirm whether callers rely on that.
        pagePreviewPdf <- previewStorage.get(previewUri).toAttempt
        highlights <- if (metadata.hasHighlights) {
          // Takes ownership of the stream and closes it when done.
          addSearchHighlightsToPageResponse(pageNumber, pagePreviewPdf, metadata.pageText)
        } else {
          // Nothing to compute: release the stream we just opened.
          pagePreviewPdf.close()
          Attempt.Right(List.empty)
        }
      } yield {
        // FIXME? Why don't we just return the PDF data here? It has already been fetched from
        // preview storage above.
        FrontendPage(pageNumber, metadata.language, allLanguages, page.dimensions, highlights)
      }
    }

    Attempt.sequence(frontendPages).map(FrontendPageResult(result.summary, _))
  }

  /**
    * Parses the preview PDF from `pageData` and computes the search highlights for `pageText` on it.
    *
    * Both the parsed [[PDDocument]] and `pageData` are closed before this returns, whether or not
    * highlight extraction succeeds. Non-fatal exceptions are turned into a failed [[Attempt]].
    *
    * @param pageNumber page the highlights belong to
    * @param pageData   stream over the page's preview PDF; consumed and closed by this method
    * @param pageText   page text containing the highlight markers
    */
  private def addSearchHighlightsToPageResponse(pageNumber: Int, pageData: InputStream, pageText: String): Attempt[List[PageHighlight]] = Attempt.catchNonFatalBlasé {
    try {
      val pagePDF = PDDocument.load(pageData)
      try {
        val highlightableText = HighlightableText.fromString(pageText, Some(pageNumber), isFind = false)
        PDFUtil.getSearchResultHighlights(highlightableText, pagePDF, pageNumber)
      } finally {
        // PDDocument holds its own resources beyond the input stream, so it must be closed explicitly.
        pagePDF.close()
      }
    } finally {
      pageData.close()
    }
  }
}
object GetPages {

  /**
    * The language chosen for a page's preview, together with that language's page text.
    *
    * @param language      the language whose preview should be used
    * @param pageText      the page text for `language`
    * @param hasHighlights true when at least one language on the page (not necessarily `language`)
    *                      contains a `<result-highlight>` marker
    */
  case class PagePreviewMetadata(language: Language, pageText: String, hasHighlights: Boolean)

  // TODO SC/JS: This name is a bit wrong - it's not simply metadata, it's language choice and *actual* data
  /**
    * Chooses which language's preview to use for `page` and returns it with that language's text.
    *
    * Order of preference:
    *  1. the language the user requested, if the page actually has text in that language
    *  2. the first language whose text contains a `<result-highlight>` marker
    *  3. any language the page has
    *
    * @param uri                   document the page belongs to; only used in the failure message
    * @param page                  the indexed page, holding text keyed by language
    * @param userRequestedLanguage language explicitly requested by the user, if any
    * @return the chosen language and text, or an [[IllegalStateFailure]] when the page has no languages
    */
  def getPagePreviewMetadata(uri: Uri, page: Page, userRequestedLanguage: Option[Language]): Attempt[PagePreviewMetadata] = {
    val pageNumber = page.page.toInt
    val allLanguages = page.value.keySet

    // OcrMyPdfExtractor will have uploaded a PDF for each page of the document, per language requested
    // We need to decide which of these we will try to use to generate rectangles for the highlights coming back from ES.
    // - If the page is only indexed using one language, use the page for that language
    // - If we only have highlights from ES for one language, use the page for that one
    // Otherwise just pick one. We might get it wrong, in which case the user will request a different language in the UI.
    val languageWithHighlights = page.value
      .collectFirst { case (lang, highlightedText) if highlightedText.contains("<result-highlight>") => lang }

    // Only honour the requested language when this page has it; otherwise the lookup below would
    // throw instead of producing an Attempt failure.
    val chosenLanguage = userRequestedLanguage
      .filter(requested => page.value.contains(requested))
      .orElse(languageWithHighlights)
      .orElse(page.value.keys.headOption)

    val chosenLanguageAndText = for {
      language <- chosenLanguage
      pageText <- page.value.get(language)
    } yield (language, pageText)

    chosenLanguageAndText match {
      case Some((language, pageText)) =>
        Attempt.Right(PagePreviewMetadata(language, pageText, languageWithHighlights.nonEmpty))

      case None =>
        Attempt.Left(IllegalStateFailure(s"Unable to determine language for highlights for page $pageNumber of $uri. Languages: $allLanguages"))
    }
  }
}