override def extractOcr()

in backend/app/extraction/ocr/OcrMyPdfExtractor.scala [44:114]


  /**
    * Runs OCR over `file` once per language in `params.languages` and publishes the results.
    *
    * For every language the (optionally pre-processed) PDF is passed to `Ocr.invokeOcrMyPdf` and the resulting PDF is
    * opened. The text of each page is then extracted per language, the pages are written via
    * `pageService.addPageContents`, every page and every whole document is uploaded to `previewStorage`, and finally
    * `OcrMyPdfExtractor.insertFullText` is called with the extracted pages.
    *
    * All documents opened here, the OCR output files and the scratch directory are released before returning,
    * whether or not extraction succeeded.
    *
    * @param blob         the blob being processed; its URI names the scratch directory and keys the stored output
    * @param file         the source PDF to OCR
    * @param params       extraction parameters; `languages` decides which OCR runs are performed
    * @param stdErrLogger passed through to the `Ocr` helpers
    * @throws NoSuchElementException if `params.languages` is empty, since there is then no OCR output to read
    */
  override def extractOcr(blob: Blob, file: File, params: ExtractionParams, stdErrLogger: OcrStderrLogger): Unit = {
    // Runs a step and hands back its non-fatal failure (if any) instead of throwing, so that later steps still run
    def attempt(step: => Any): Option[Throwable] = Try(step).failed.toOption

    val tmpDir = scratch.createWorkingDir(s"ocrmypdf-tmp-${blob.uri.value}")
    // Grown one language at a time so that `finally` can release whatever was opened even if a later language fails
    var pdDocuments: Map[Language, (Path, PDDocument)] = Map.empty

    try {
      // The page count of the source does not depend on the language: read it once and close the document
      // straight away rather than leaving it open. Any failure here simply means no page count is supplied.
      val pageCount: Option[Int] = Try {
        val source = PDDocument.load(file)
        try source.getNumberOfPages finally source.close()
      }.toOption

      params.languages.foreach { lang =>
        // NOTE(review): the arguments do not vary per language, so this looks hoistable - confirm that
        // Ocr.preProcessPdf has no per-call side effects before moving it out of the loop.
        val preProcessPdf = Ocr.preProcessPdf(file.toPath, tmpDir, stdErrLogger)
        val pdfPath = Ocr.invokeOcrMyPdf(lang.ocr, preProcessPdf.getOrElse(file.toPath), None, stdErrLogger, tmpDir, pageCount)

        val pdfDoc =
          try PDDocument.load(pdfPath.toFile)
          catch {
            case e: Exception =>
              // The output file is not tracked in pdDocuments yet, so remove it here before propagating the failure
              attempt(Files.deleteIfExists(pdfPath)).foreach(cleanupError => e.addSuppressed(cleanupError))
              throw e
          }

        pdDocuments = pdDocuments.updated(lang, (pdfPath, pdfDoc))
      }

      // All docs have the same number of pages with the same dimensions, just different text from the OCR run per language
      val (_, (_, firstDoc)) = pdDocuments.headOption.getOrElse(
        throw new NoSuchElementException(s"No OCR languages were supplied for ${blob.uri.value}")
      )
      val numberOfPages = firstDoc.getNumberOfPages

      val base = (List.empty[Page], 0.0)

      // Pages are prepended and reversed once at the end: appending to a List is linear per page
      val (reversedPages, _) = (1 to numberOfPages).foldLeft(base) { case ((collected, offsetHeight), pageNumber) =>
        val page = firstDoc.getPage(pageNumber - 1)
        val pageBoundingBox = page.getMediaBox

        // Pages are laid out one below the other: each page starts where the previous one ended
        val dimensions = PageDimensions(
          width = pageBoundingBox.getWidth,
          height = pageBoundingBox.getHeight,
          top = offsetHeight,
          bottom = offsetHeight + pageBoundingBox.getHeight
        )

        val textByLanguage = pdDocuments.map { case (lang, (_, doc)) =>
          assert(doc.getNumberOfPages == numberOfPages, s"Number of pages mismatch across languages: ${pdDocuments.view.mapValues(_._2.getNumberOfPages).toMap}")

          // Restrict the stripper to the current page only
          val reader = new PDFTextStripper()
          reader.setStartPage(pageNumber)
          reader.setEndPage(pageNumber)

          val text = reader.getText(doc)
          lang -> text
        }

        (Page(pageNumber, textByLanguage, dimensions) :: collected, dimensions.bottom)
      }

      val pages = reversedPages.reverse

      // Write to the page index in Elasticsearch - a document in the index corresponds to a single page
      pageService.addPageContents(blob.uri, pages)

      // Upload each page to S3, per language. This is because OCRing English produces totally different output to OCRing
      // Russian for example so we store each page and decide later which one to serve the viewer
      pdDocuments.foreach { case (lang, (path, doc)) =>
        (1 to numberOfPages).foreach { pageNumber =>
          val page = doc.getPage(pageNumber - 1)
          uploadPageAsSeparatePdf(blob, lang, pageNumber, page, previewStorage)
        }

        // Upload the entire document to S3, per language. We serve these to the client as a download of the whole doc
        // TODO MRB: stop overwriting when we are OCRing against multiple languages?
        previewStorage.create(blob.uri.toStoragePath, path, Some("application/pdf"))
      }

      OcrMyPdfExtractor.insertFullText(blob.uri, pages, index)
    } finally {
      // Attempt every cleanup step even when an earlier one fails, then surface the first failure with the
      // remaining ones attached as suppressed exceptions
      val cleanupFailures =
        pdDocuments.toList.flatMap { case (_, (path, doc)) =>
          attempt(doc.close()).toList ++ attempt(Files.deleteIfExists(path)).toList
        } ++ attempt(FileUtils.deleteDirectory(tmpDir.toFile)).toList

      cleanupFailures.headOption.foreach { first =>
        cleanupFailures.tail.foreach(other => first.addSuppressed(other))
        throw first
      }
    }
  }