in backend/app/extraction/ocr/OcrMyPdfExtractor.scala [44:114]
/**
  * Runs OCRmyPDF over `file` once per language in `params.languages`, then indexes and stores the results.
  *
  * For each language the (optionally pre-processed) PDF is passed to `Ocr.invokeOcrMyPdf` and the output is
  * loaded with PDFBox. The per-page text for every language is sent to `pageService.addPageContents`, each page
  * is uploaded as a separate PDF via `uploadPageAsSeparatePdf`, the whole OCRed document is written to
  * `previewStorage`, and finally `OcrMyPdfExtractor.insertFullText` is called with the collected pages.
  *
  * Every document opened here, its output file and the scratch directory are released in the `finally` block,
  * whether or not the extraction succeeds.
  *
  * @param blob         the blob being processed; its URI names the scratch directory and keys the stored results
  * @param file         the PDF to OCR
  * @param params       extraction parameters; only `languages` is read here
  * @param stdErrLogger passed through to the pre-processing and OCRmyPDF invocations
  * @throws NoSuchElementException if `params.languages` is empty, so there is no OCR output to process
  */
override def extractOcr(blob: Blob, file: File, params: ExtractionParams, stdErrLogger: OcrStderrLogger): Unit = {
  val tmpDir = scratch.createWorkingDir(s"ocrmypdf-tmp-${blob.uri.value}")
  // Grown one entry at a time so that the finally block can always see (and close) every document opened so far,
  // even when OCR fails part-way through the list of languages.
  var pdDocuments: Map[Language, (Path, PDDocument)] = Map.empty

  try {
    // The page count of the input does not depend on the language, so read it once and close the handle
    // straight away. A failure to read it is tolerated and simply means no page count is passed on.
    val inputPageCount = Try {
      val inputDoc = PDDocument.load(file)
      try inputDoc.getNumberOfPages finally inputDoc.close()
    }.toOption

    params.languages.foreach { lang =>
      // NOTE(review): pre-processing is still repeated per language, as before; it looks language-independent
      // and could probably be hoisted - confirm that invokeOcrMyPdf leaves its input file untouched first.
      val preProcessPdf = Ocr.preProcessPdf(file.toPath, tmpDir, stdErrLogger)
      val pdfPath = Ocr.invokeOcrMyPdf(lang.ocr, preProcessPdf.getOrElse(file.toPath), None, stdErrLogger, tmpDir, inputPageCount)
      val pdfDoc = PDDocument.load(pdfPath.toFile)
      pdDocuments = pdDocuments.updated(lang, (pdfPath, pdfDoc))
    }

    // All docs have the same number of pages with the same dimensions, just different text from the OCR run per language
    val (_, (_, firstDoc)) = pdDocuments.headOption.getOrElse {
      throw new NoSuchElementException(s"No OCR languages supplied for ${blob.uri.value}, nothing to extract")
    }
    val numberOfPages = firstDoc.getNumberOfPages

    // Check the invariant once up front rather than once per page per language
    assert(
      pdDocuments.values.forall { case (_, doc) => doc.getNumberOfPages == numberOfPages },
      s"Number of pages mismatch across languages: ${pdDocuments.view.mapValues(_._2.getNumberOfPages).toMap}"
    )

    // Pages are prepended and reversed at the end: appending to a List inside the fold would be quadratic.
    // The second element of the accumulator is the running vertical offset of the next page.
    val (reversedPages, _) = (1 to numberOfPages).foldLeft((List.empty[Page], 0.0)) { case ((acc, offsetHeight), pageNumber) =>
      val pageBoundingBox = firstDoc.getPage(pageNumber - 1).getMediaBox

      val dimensions = PageDimensions(
        width = pageBoundingBox.getWidth,
        height = pageBoundingBox.getHeight,
        top = offsetHeight,
        bottom = offsetHeight + pageBoundingBox.getHeight
      )

      val textByLanguage = pdDocuments.map { case (lang, (_, doc)) =>
        // PDFTextStripper page numbers are 1-based, matching pageNumber
        val reader = new PDFTextStripper()
        reader.setStartPage(pageNumber)
        reader.setEndPage(pageNumber)

        lang -> reader.getText(doc)
      }

      (Page(pageNumber, textByLanguage, dimensions) :: acc, dimensions.bottom)
    }
    val pages = reversedPages.reverse

    // Write to the page index in Elasticsearch - a document in the index corresponds to a single page
    pageService.addPageContents(blob.uri, pages)

    // Upload each page to S3, per language. This is because OCRing English produces totally different output to OCRing
    // Russian for example so we store each page and decide later which one to serve the viewer
    pdDocuments.foreach { case (lang, (path, doc)) =>
      (1 to numberOfPages).foreach { pageNumber =>
        val page = doc.getPage(pageNumber - 1)
        uploadPageAsSeparatePdf(blob, lang, pageNumber, page, previewStorage)
      }

      // Upload the entire document to S3, per language. We serve these to the client as a download of the whole doc
      // TODO MRB: stop overwriting when we are OCRing against multiple languages?
      previewStorage.create(blob.uri.toStoragePath, path, Some("application/pdf"))
    }

    OcrMyPdfExtractor.insertFullText(blob.uri, pages, index)
  } finally {
    // Attempt every close and delete even if an earlier one fails, so a single bad handle cannot leak the rest
    val cleanupFailures = pdDocuments.values.toList.flatMap { case (path, doc) =>
      val closeFailure = Try(doc.close()).failed.toOption
      val deleteFailure = Try(Files.deleteIfExists(path)).failed.toOption
      closeFailure.toList ++ deleteFailure.toList
    }

    try {
      FileUtils.deleteDirectory(tmpDir.toFile)
    } finally {
      // Cleanup problems are still reported, but only once everything has been given a chance to be released
      cleanupFailures.headOption.foreach(failure => throw failure)
    }
  }
}