in backend/app/utils/Ocr.scala [113:220]
/**
 * Runs `ocrmypdf` over `inputFilePath` and returns the path of the file it produced.
 *
 * The first attempt uses `RedoOcr`. Depending on the exit code the run may be retried once with
 * `SkipText` (exit code 2) or, after removing password protection with `qpdf`, once more with
 * `RedoOcr` (exit code 8).
 *
 * @param lang          value passed to ocrmypdf's `-l` option
 * @param inputFilePath PDF to process
 * @param dpi           when defined, passed to ocrmypdf as `--image-dpi`
 * @param stderr        receives every stderr line of the spawned processes
 * @param tmpDir        directory for the output file and the decrypted copy; also exported to
 *                      ocrmypdf as `TMPDIR`
 * @param numberOfPages page count used to scale the timeout; `None` selects the default timeout
 * @return `<tmpDir>/<input file name>.ocr.pdf` when ocrmypdf exits with 0, 4 or 10
 * @throws OcrMyPdfTimeout when an ocrmypdf run exceeds its timeout
 * @throws OcrSubprocessInterruptedException for any exit code without a dedicated exception
 *         (the remaining known exit codes each throw their own `OcrMyPdf*` exception, see below)
 */
def invokeOcrMyPdf(lang: String, inputFilePath: Path, dpi: Option[Int], stderr: OcrStderrLogger, tmpDir: Path, numberOfPages: Option[Int]): Path = {
  val tempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.ocr.pdf")

  // Nothing in this method reads the subprocess stdout, so it is drained and dropped rather
  // than accumulated in memory.
  val discardStdout: String => Unit = _ => ()

  // Timeout is added because ocrMyPdf may get stuck processing a file
  // and we don't want to block the worker forever
  def runProcessWithTimeout(processBuilder: ProcessBuilder): Int = {
    val running = processBuilder.run(ProcessLogger(discardStdout, stderr.append))
    val exit = Future(blocking(running.exitValue()))

    // timeout duration gives at most 1 minute to ocr every page plus an overall 5 minutes
    // to mitigate the risk of un-necessary timeout for files with low number of pages.
    // default 300 minutes is used in case the number of pages failed to
    // get retrieved from the file and the value was None at this stage
    val timeoutMinutes = numberOfPages match {
      case Some(pages) =>
        (pages * TIMEOUT_DURATION_PER_PAGE) + TIMEOUT_DURATION_OVERHEAD
      case None =>
        logger.warn(s"Default timeout duration $DEFAULT_TIMEOUT_IN_MINUTES used since numberOfPages is None")
        DEFAULT_TIMEOUT_IN_MINUTES
    }

    try {
      Await.result(exit, duration.Duration(timeoutMinutes, TimeUnit.MINUTES))
    } catch {
      case _: TimeoutException =>
        running.destroy()
        // Block until the destroyed process has actually gone before reporting the timeout.
        running.exitValue()
        throw OcrMyPdfTimeout(s"Timing out after ${timeoutMinutes} minutes")
    }
  }

  def ocrWithOcrMyPdf(flag: OcrMyPdfFlag, overrideFile: Option[Path] = None): Int = {
    val sourceFilePath = overrideFile.getOrElse(inputFilePath)

    // The command is passed as an argument vector, not a single string: Process(String) splits
    // on whitespace, which breaks any path containing a space.
    // The flag text is still split on whitespace, as the string form did, in case it carries
    // more than one token; it is interpolated so this does not depend on its declared type.
    val flagArgs = s"${flag.flag}".split("\\s+").filter(_.nonEmpty).toSeq
    val dpiArgs = dpi.toSeq.flatMap(value => Seq("--image-dpi", value.toString))
    val command =
      Seq("ocrmypdf") ++ flagArgs ++ Seq("-l", lang) ++ dpiArgs ++
        Seq(sourceFilePath.toAbsolutePath.toString, tempFile.toAbsolutePath.toString)

    val builder = Process(command, None, "TMPDIR" -> tmpDir.toAbsolutePath.toString)
    runProcessWithTimeout(builder)
  }

  // Writes a decrypted copy of the input to decryptTempFile; returns true when qpdf exits with 0.
  def decryptWithQpdf(decryptTempFile: Path): Boolean = {
    val command = Seq("qpdf", "--decrypt", inputFilePath.toAbsolutePath.toString, decryptTempFile.toAbsolutePath.toString)
    val qpdfExitCode = Process(command, None).!(ProcessLogger(discardStdout, stderr.append))
    if (qpdfExitCode != 0) {
      logger.info(s"Failed to decrypt with qpdf (exit code ${qpdfExitCode}) - file is likely encrypted with a user password.")
    }
    qpdfExitCode == 0
  }

  /*
   * We may reattempt the ocrmypdf process with different flags/cleaned input files. This recursive function handles
   * the different possible flows.
   *
   * @param flag option to pass to ocrmypdf - probably --redo-ocr or --skip-text
   * @param previousExitCode needed to prevent an infinite loop where ocrmypdf keeps failing with the same exit code
   * @param overrideFile used where a file has been e.g decrypted and we want to run ocrmypdf on the new file
   * @return final exit code
   */
  @tailrec
  def process(flag: OcrMyPdfFlag, previousExitCode: Option[Int] = None, overrideFile: Option[Path] = None): Int = {
    val redoOcrExitCode = ocrWithOcrMyPdf(flag, overrideFile)
    redoOcrExitCode match {
      case 2 if !previousExitCode.contains(2) =>
        // Exit code 2 from ocrmypdf is an input file error, we've noticed that this can be an error with --redo-ocr, and that
        // running with --skip-text instead results in success. For example, if a PDF has a user fillable form then it can't
        // be ocrd with --redo-ocr set. See https://github.com/guardian/giant/pull/68 for details of --skip-text vs --redo-ocr
        logger.info(s"Got input file error from ocrmypdf with --redo-ocr for ${inputFilePath.getFileName}, attempting with --skip-text")
        process(SkipText, Some(redoOcrExitCode), overrideFile)
      case 8 if !previousExitCode.contains(8) =>
        // exit code 8 indicates that the file is encrypted. If it has a user password we can go no further, but if it only
        // has an 'owner' password we can remove the password protection with qpdf - see
        // https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html#password-protected-pdfs
        logger.info("PDF password protected, attempting to remove protection with qpdf")
        val decryptTempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.decrypt.pdf")
        val qpdfResult = decryptWithQpdf(decryptTempFile)
        // If we managed to decrypt the file, have another go at running ocrmypdf
        if (qpdfResult) {
          process(RedoOcr, Some(redoOcrExitCode), Some(decryptTempFile))
        } else {
          redoOcrExitCode
        }
      case _ => redoOcrExitCode
    }
  }

  val exitCode = process(RedoOcr)
  exitCode match {
    // 0: success
    // 4: "An output file was created, but it does not seem to be a valid PDF. The file will be available."
    // 10: "A valid PDF was created, PDF/A conversion failed. The file will be available."
    // These both produce an output file (they're more like warnings than failures)
    // so we want to return the file instead of throwing an exception.
    case 0 | 4 | 10 => tempFile
    case 1 => throw OcrMyPdfBadArgs
    case 2 => throw OcrMyPdfInputFile
    case 3 => throw OcrMyPdfMissingDependency
    case 5 => throw OcrMyPdfFileAccessError
    case 6 => throw OcrMyPdfAlreadyDoneOcr
    case 7 => throw OcrMyPdfChildProcessError
    case 8 => throw OcrMyPdfEncryptedPdf
    case 9 => throw OcrMyPdfInvalidConfig
    case 15 => throw OcrMyPdfOtherError
    case 130 => throw OcrMyPdfCtrlC
    // This default case will cover code 143 where worker was terminated midway through.
    // Don't register this as a failure to allow another worker to pick it up
    case _ => throw OcrSubprocessInterruptedException
  }
}