def invokeOcrMyPdf()

in backend/app/utils/Ocr.scala [113:220]


  /**
   * Runs `ocrmypdf` over `inputFilePath`, writing the result to `<input file name>.ocr.pdf` inside `tmpDir`.
   *
   * The first attempt uses [[RedoOcr]]. Depending on the exit code the run may be retried with
   * [[SkipText]] (exit code 2) or against a copy decrypted with `qpdf` (exit code 8).
   *
   * External commands are started from an explicit argument list rather than a single command string,
   * so paths containing whitespace or quote characters reach the tools as single arguments.
   *
   * @param lang          value passed to ocrmypdf's `-l` option
   * @param inputFilePath the PDF to OCR
   * @param dpi           when defined, passed to ocrmypdf as `--image-dpi`
   * @param stderr        receives every line the subprocesses write to stderr
   * @param tmpDir        directory for the output/decrypted files; also exported to ocrmypdf as `TMPDIR`
   * @param numberOfPages page count used to scale the timeout; the default timeout is used when `None`
   * @return path of the file produced by ocrmypdf (returned for exit codes 0, 4 and 10)
   * @throws OcrMyPdfTimeout if ocrmypdf does not finish within the computed timeout; other exit codes
   *                         are mapped to the dedicated exceptions in the final `match` of this method
   */
  def invokeOcrMyPdf(lang: String, inputFilePath: Path, dpi: Option[Int], stderr: OcrStderrLogger, tmpDir: Path, numberOfPages: Option[Int]): Path = {
    val tempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.ocr.pdf")
    // Lines the subprocesses write to stdout are appended here; nothing in this method reads them back.
    val stdout = mutable.Buffer.empty[String]

    // Timeout is added because ocrMyPdf may get stuck processing a file
    // and we don't want to block the worker forever
    def runProcessWithTimeout(processBuilder: ProcessBuilder): Int = {
      val running = processBuilder.run(ProcessLogger(stdout.append(_), stderr.append))
      val eventualExitCode = Future(blocking(running.exitValue()))
      // The timeout scales with the page count (TIMEOUT_DURATION_PER_PAGE per page) plus a fixed
      // TIMEOUT_DURATION_OVERHEAD so that short documents are not timed out unnecessarily.
      // DEFAULT_TIMEOUT_IN_MINUTES is the fallback when the page count could not be determined upstream.
      val timeoutMinutes = numberOfPages match {
        case Some(pages) =>
          (pages * TIMEOUT_DURATION_PER_PAGE) + TIMEOUT_DURATION_OVERHEAD
        case None =>
          logger.warn(s"Default timeout duration $DEFAULT_TIMEOUT_IN_MINUTES used since numberOfPages is None")
          DEFAULT_TIMEOUT_IN_MINUTES
      }
      try {
        Await.result(eventualExitCode, duration.Duration(timeoutMinutes, TimeUnit.MINUTES))
      } catch {
        case _: TimeoutException =>
          running.destroy()
          // Wait for the destroyed process to actually exit before reporting the timeout.
          running.exitValue()
          throw OcrMyPdfTimeout(s"Timing out after ${timeoutMinutes} minutes")
      }
    }

    def ocrWithOcrMyPdf(flag: OcrMyPdfFlag, overrideFile: Option[Path] = None): Int = {
      val sourceFilePath = overrideFile.getOrElse(inputFilePath)
      // The flag text is split on whitespace so a multi-token flag still expands to several arguments,
      // exactly as it did when the whole command was a single string.
      val flagArgs = s"${flag.flag}".split("\\s+").filter(_.nonEmpty).toList
      val dpiArgs = dpi.toList.flatMap(value => List("--image-dpi", value.toString))
      // Paths are passed as single argv entries: no tokenization, so whitespace in a path is safe.
      val cmd: Seq[String] =
        List("ocrmypdf") ++
          flagArgs ++
          List("-l", lang) ++
          dpiArgs ++
          List(sourceFilePath.toAbsolutePath.toString, tempFile.toAbsolutePath.toString)
      val builder = Process(cmd, None, "TMPDIR" -> tmpDir.toAbsolutePath.toString)
      runProcessWithTimeout(builder)
    }

    def decryptWithQpdf(decryptTempFile: Path): Boolean = {
      val cmd: Seq[String] = List(
        "qpdf",
        "--decrypt",
        inputFilePath.toAbsolutePath.toString,
        decryptTempFile.toAbsolutePath.toString
      )
      val qpdfExitCode = Process(cmd, None).!(ProcessLogger(stdout.append(_), stderr.append))
      if (qpdfExitCode != 0) {
        logger.info(s"Failed to decrypt with qpdf (exit code ${qpdfExitCode}) - file is likely encrypted with a user password.")
      }
      qpdfExitCode == 0
    }

    /*
     * We may reattempt the ocrmypdf process with different flags/cleaned input files. This recursive function
     * handles the different possible flows.
     *
     * @param flag             option to pass to ocrmypdf - probably --redo-ocr or --skip-text
     * @param previousExitCode needed to prevent an infinite loop where ocrmypdf keeps failing with the same exit code
     * @param overrideFile     used where a file has been e.g. decrypted and we want to run ocrmypdf on the new file
     * @return final exit code
     */
    @tailrec
    def process(flag: OcrMyPdfFlag, previousExitCode: Option[Int] = None, overrideFile: Option[Path] = None): Int = {
      val redoOcrExitCode = ocrWithOcrMyPdf(flag, overrideFile)
      redoOcrExitCode match {
        case 2 if !previousExitCode.contains(2) =>
          // Exit code 2 from ocrmypdf is an input file error, we've noticed that this can be an error with --redo-ocr, and that
          // running with --skip-text instead results in success. For example, if a PDF has a user fillable form then it can't
          // be ocrd with --redo-ocr set. See https://github.com/guardian/giant/pull/68 for details of --skip-text vs --redo-ocr
          logger.info(s"Got input file error from ocrmypdf with --redo-ocr for ${inputFilePath.getFileName}, attempting with --skip-text")
          process(SkipText, Some(redoOcrExitCode), overrideFile)
        case 8 if !previousExitCode.contains(8) =>
          // exit code 8 indicates that the file is encrypted. If it has a user password we can go no further, but if it only
          // has an 'owner' password we can remove the password protection with qpdf - see
          // https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html#password-protected-pdfs
          logger.info("PDF password protected, attempting to remove protection with qpdf")
          val decryptTempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.decrypt.pdf")
          val qpdfResult = decryptWithQpdf(decryptTempFile)
          // If we managed to decrypt the file, have another go at running ocrmypdf
          if (qpdfResult) {
            process(RedoOcr, Some(redoOcrExitCode), Some(decryptTempFile))
          } else {
            redoOcrExitCode
          }
        case _ => redoOcrExitCode
      }
    }

    val exitCode = process(RedoOcr)

    exitCode match {
      // 0: success
      // 4: "An output file was created, but it does not seem to be a valid PDF. The file will be available."
      // 10: "A valid PDF was created, PDF/A conversion failed. The file will be available."
      // These both produce an output file (they're more like warnings than failures)
      // so we want to return the file instead of throwing an exception.
      case 0 | 4 | 10 => tempFile
      case 1 => throw OcrMyPdfBadArgs
      case 2 => throw OcrMyPdfInputFile
      case 3 => throw OcrMyPdfMissingDependency
      case 5 => throw OcrMyPdfFileAccessError
      case 6 => throw OcrMyPdfAlreadyDoneOcr
      case 7 => throw OcrMyPdfChildProcessError
      case 8 => throw OcrMyPdfEncryptedPdf
      case 9 => throw OcrMyPdfInvalidConfig
      case 15 => throw OcrMyPdfOtherError
      case 130 => throw OcrMyPdfCtrlC
      // This default case will cover code 143 where worker was terminated midway through.
      // Don't register this as a failure to allow another worker to pick it up
      case _ => throw OcrSubprocessInterruptedException
    }
  }