backend/app/utils/Ocr.scala (153 lines of code) (raw):

package utils

import model.ingestion.{OcrMyPdfFlag, RedoOcr, SkipText}

import java.nio.file.{Files, Path}
import services.TesseractOcrConfig
import utils.attempt.Failure

import java.util.concurrent.TimeUnit
import scala.annotation.tailrec
import scala.collection.mutable
import scala.concurrent.{Await, Future, TimeoutException, blocking, duration}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.sys.process._

/**
  * Collects the stderr output of an OCR subprocess line by line.
  *
  * Every line is stored in `acc`, written to the application log at info level and, at most once per
  * `LOG_THROTTLE_RATE_MILLIS`, forwarded to the optional `setProgressNote` callback.
  *
  * @param setProgressNote optional callback that receives a throttled subset of the stderr lines;
  *                        the `Either` it returns is discarded by this class
  */
class OcrStderrLogger(setProgressNote: Option[String => Either[Failure, Unit]]) extends Logging {
  val LOG_THROTTLE_RATE_MILLIS = 5000

  val acc = mutable.Buffer[String]()
  var lastLogTime: Option[Long] = None

  /** Records one line of stderr output, logs it and (subject to throttling) reports it as a progress note. */
  def append(line: String): Unit = {
    acc.append(line)
    logger.info(line)

    // Avoid spamming the database should the extractor output a lot of stderr logging quickly
    val now = System.currentTimeMillis()
    // `forall` is true when nothing has been forwarded yet, or when the last forward is older than the throttle window
    if (lastLogTime.forall(last => (now - last) > LOG_THROTTLE_RATE_MILLIS)) {
      setProgressNote.foreach(note => note(line))
      lastLogTime = Some(now)
    }
  }

  /** Returns every line recorded so far, joined with newlines. */
  def getOutput: String = acc.mkString("\n")
}

object Ocr extends Logging {
  class OcrSubprocessCrashedException(exitCode: Int, stderr: String) extends Exception(s"Exit code: $exitCode: ${stderr}")
  object OcrSubprocessInterruptedException extends Exception("Ocr subprocess terminated externally")

  // from https://github.com/jbarlow83/OCRmyPDF/blob/4b8ccbe8cb76480b03ab42b0c61814acd1c59a60/docs/advanced.rst#return-code-policy
  object OcrMyPdfBadArgs extends Exception("Invalid arguments, exited with an error.")
  object OcrMyPdfInputFile extends Exception("The input file does not seem to be a valid PDF.")
  object OcrMyPdfMissingDependency extends Exception("An external program required by OCRmyPDF is missing.")
  object OcrMyPdfInvalidOutputPdf extends Exception("An output file was created, but it does not seem to be a valid PDF. The file will be available.")
  object OcrMyPdfFileAccessError extends Exception("The user running OCRmyPDF does not have sufficient permissions to read the input file and write the output file.")
  object OcrMyPdfAlreadyDoneOcr extends Exception("The file already appears to contain text so it may not need OCR. See output message.")
  object OcrMyPdfChildProcessError extends Exception("An error occurred in an external program (child process) and OCRmyPDF cannot continue.")
  object OcrMyPdfEncryptedPdf extends Exception("The input PDF is encrypted. OCRmyPDF does not read encrypted PDFs. Use another program such as qpdf to remove encryption.")
  object OcrMyPdfInvalidConfig extends Exception("A custom configuration file was forwarded to Tesseract using --tesseract-config, and Tesseract rejected this file.")
  object OcrMyPdfPdfaConversionFailed extends Exception("A valid PDF was created, PDF/A conversion failed. The file will be available.")
  object OcrMyPdfOtherError extends Exception("Some other error occurred.")
  object OcrMyPdfCtrlC extends Exception("The program was interrupted by pressing Ctrl+C.")
  case class OcrMyPdfTimeout(msg: String) extends Exception(msg)

  // Timeout settings for ocrmypdf, all expressed in minutes (see invokeOcrMyPdf)
  val TIMEOUT_DURATION_PER_PAGE = 1
  val DEFAULT_TIMEOUT_IN_MINUTES = 300
  val TIMEOUT_DURATION_OVERHEAD = 5

  /**
    * Runs the `tesseract` binary on a single image and returns the recognised text.
    *
    * @param lang          value passed to tesseract's `-l` option
    * @param imageFileName path of the image to OCR
    * @param config        supplies the `--oem` and `--psm` values
    * @param stderr        receives every line tesseract writes to stderr
    * @return tesseract's stdout lines joined with newlines
    * @throws OcrSubprocessInterruptedException if tesseract exits with code 143
    * @throws OcrSubprocessCrashedException     for any other non-zero exit code
    */
  def invokeTesseractDirectly(lang: String, imageFileName: String, config: TesseractOcrConfig, stderr: OcrStderrLogger): String = {
    // An explicit argument vector (rather than one whitespace-tokenised string) keeps file names
    // that contain whitespace intact.
    val cmd = Seq(
      "tesseract", imageFileName, "stdout",
      "-l", lang,
      "--oem", config.engineMode.toString,
      "--psm", config.pageSegmentationMode.toString
    )

    val stdout = mutable.Buffer.empty[String]
    val exitCode = Process(cmd).!(ProcessLogger(stdout.append(_), stderr.append))

    exitCode match {
      case 143 =>
        // The worker was terminated midway through. Don't register this as a failure to allow another worker to pick it up
        throw OcrSubprocessInterruptedException
      case 0 =>
        stdout.mkString("\n")
      case _ =>
        throw new OcrSubprocessCrashedException(exitCode, stderr.getOutput)
    }
  }

  /**
    * Reduces the ppi of the images within the pdf to 300 if they are over this limit.
    * This improves the performance of ocrMyPdf but also reduces the chance of ocr failures
    * because ocrMyPdf can not handle image sizes over 500000000 pixels.
    *
    * @param inputFilePath the PDF to down sample
    * @param tmpDir        directory in which `<input file name>.downsampled.pdf` is written
    * @param stderr        receives every line Ghostscript writes to stderr
    * @return the path of the down sampled file when `gs` exits with 0, otherwise `None` (a warning is logged)
    */
  def preProcessPdf(inputFilePath: Path, tmpDir: Path, stderr: OcrStderrLogger): Option[Path] = {
    val tempDownSampledFile = tmpDir.resolve(s"${inputFilePath.getFileName}.downsampled.pdf")

    val cmd = Seq(
      "gs",
      "-sDEVICE=pdfwrite",
      "-dDownsampleColorImages=true",
      "-dDownsampleGrayImages=true",
      "-dDownsampleMonoImages=true",
      "-dColorImageResolution=300",
      "-dGrayImageResolution=300",
      "-dMonoImageResolution=300",
      "-o", tempDownSampledFile.toString,
      inputFilePath.toAbsolutePath.toString
    )

    // Capture gs stdout locally. Without a local buffer the name `stdout` would resolve to
    // scala.sys.process.stdout (System.out) via the wildcard import and the output would be
    // written to the JVM's stdout without line separators.
    val gsStdout = mutable.Buffer.empty[String]
    val exitCode = Process(cmd).!(ProcessLogger(gsStdout.append(_), stderr.append))

    exitCode match {
      case 0 =>
        Some(tempDownSampledFile)
      case _ =>
        logger.warn(s"Failed to down sample the file ${inputFilePath.getFileName}. exit code ${exitCode} .")
        if (gsStdout.nonEmpty) {
          logger.warn(s"gs output for ${inputFilePath.getFileName}: ${gsStdout.mkString("\n")}")
        }
        None
    }
  }

  // TODO MRB: allow OcrMyPdf to read DPI if set in metadata
  /**
    * OCRmyPDF is a wrapper for Tesseract that we use to overlay the OCR as a text layer in the resulting PDF.
    *
    * The first attempt uses `RedoOcr`; depending on the exit code it may be retried with `SkipText`
    * or on a copy decrypted with `qpdf` (see the nested `process` function).
    *
    * @param lang          value passed to ocrmypdf's `-l` option
    * @param inputFilePath the PDF to OCR
    * @param dpi           when defined, passed to ocrmypdf as `--image-dpi`
    * @param stderr        receives every line the subprocesses write to stderr
    * @param tmpDir        directory for the output and intermediate files; also exported to ocrmypdf as `TMPDIR`
    * @param numberOfPages used to size the timeout; `DEFAULT_TIMEOUT_IN_MINUTES` is used when `None`
    * @return the path of `<input file name>.ocr.pdf` inside `tmpDir` for exit codes 0, 4 and 10
    * @throws OcrMyPdfTimeout if ocrmypdf does not finish within the computed timeout
    * @throws Exception       one of the `OcrMyPdf*` objects for the documented failure exit codes, and
    *                         `OcrSubprocessInterruptedException` for every other exit code
    */
  def invokeOcrMyPdf(lang: String, inputFilePath: Path, dpi: Option[Int], stderr: OcrStderrLogger, tmpDir: Path, numberOfPages: Option[Int]): Path = {
    val tempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.ocr.pdf")
    val stdout = mutable.Buffer.empty[String]

    // Timeout is added because ocrMyPdf may get stuck processing a file
    // and we don't want to block the worker forever
    def runProcessWithTimeout(processBuilder: ProcessBuilder): Int = {
      val process = processBuilder.run(ProcessLogger(stdout.append(_), stderr.append))
      val future = Future(blocking(process.exitValue()))

      // timeout duration gives at most 1 minute to ocr every page plus an overall 5 minutes
      // to mitigate the risk of un-necessary timeout for files with low number of pages.
      // default 300 minutes is used in case the number of pages failed to
      // get retrieved from the file and the value was None at this stage
      val timeoutMinutes = numberOfPages match {
        case Some(pages) =>
          (pages * TIMEOUT_DURATION_PER_PAGE) + TIMEOUT_DURATION_OVERHEAD
        case None =>
          logger.warn(s"Default timeout duration $DEFAULT_TIMEOUT_IN_MINUTES used since numberOfPages is None")
          DEFAULT_TIMEOUT_IN_MINUTES
      }

      try {
        Await.result(future, duration.Duration(timeoutMinutes, TimeUnit.MINUTES))
      } catch {
        case _: TimeoutException =>
          process.destroy()
          // wait for the destroyed process to actually exit before reporting the timeout
          process.exitValue()
          throw OcrMyPdfTimeout(s"Timing out after ${timeoutMinutes} minutes")
      }
    }

    // Runs ocrmypdf once with the given flag, on `overrideFile` if supplied, otherwise on the original input.
    def ocrWithOcrMyPdf(flag: OcrMyPdfFlag, overrideFile: Option[Path] = None): Int = {
      val sourceFilePath = overrideFile.getOrElse(inputFilePath)

      // The flag is tokenised on whitespace exactly as the previous single-string command was;
      // paths are passed as single arguments so that whitespace in them is preserved.
      val flagArgs = flag.flag.toString.split("\\s+").filter(_.nonEmpty).toSeq
      val dpiArgs = dpi.toSeq.flatMap(value => Seq("--image-dpi", value.toString))
      val cmd = Seq("ocrmypdf") ++ flagArgs ++ Seq("-l", lang) ++ dpiArgs ++
        Seq(sourceFilePath.toAbsolutePath.toString, tempFile.toAbsolutePath.toString)

      val process = Process(cmd, None, "TMPDIR" -> tmpDir.toAbsolutePath.toString)
      runProcessWithTimeout(process)
    }

    // Attempts to write a decrypted copy of the input file to `decryptTempFile`; true when qpdf exits with 0.
    def decryptWithQpdf(decryptTempFile: Path): Boolean = {
      val cmd = Seq("qpdf", "--decrypt", inputFilePath.toAbsolutePath.toString, decryptTempFile.toAbsolutePath.toString)
      val qpdfExitCode = Process(cmd).!(ProcessLogger(stdout.append(_), stderr.append))

      if (qpdfExitCode != 0) {
        logger.info(s"Failed to decrypt with qpdf (exit code ${qpdfExitCode} - file is likely encrypted with a user password.")
      }
      qpdfExitCode == 0
    }

    /*
     * We may reattempt the ocrmypdf process with different flags/cleaned input files. This recursive function handles
     * the different possible flows.
     *
     * @param flag option to pass to ocrmypdf - probably --redo-ocr or --skip-text
     * @param previousExitCode needed to prevent an infinite loop where ocrmypdf keeps failing with the same exit code
     * @param overrideFile used where a file has been e.g decrypted and we want to run ocrmypdf on the new file
     * @return final exit code
     */
    @tailrec
    def process(flag: OcrMyPdfFlag, previousExitCode: Option[Int] = None, overrideFile: Option[Path] = None): Int = {
      val ocrExitCode = ocrWithOcrMyPdf(flag, overrideFile)

      ocrExitCode match {
        case 2 if !previousExitCode.contains(2) =>
          // Exit code 2 from ocrmypdf is an input file error, we've noticed that this can be an error with --redo-ocr, and that
          // running with --skip-text instead results in success. For example, if a PDF has a user fillable form then it can't
          // be ocrd with --redo-ocr set. See https://github.com/guardian/giant/pull/68 for details of --skip-text vs --redo-ocr
          logger.info(s"Got input file error from ocrmypdf with --redo-ocr for ${inputFilePath.getFileName}, attempting with --skip-text")
          process(SkipText, Some(ocrExitCode), overrideFile)

        case 8 if !previousExitCode.contains(8) =>
          // exit code 8 indicates that the file is encrypted. If it has a user password we can go no further, but if it only
          // has an 'owner' password we can remove the password protection with qpdf - see
          // https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html#password-protected-pdfs
          logger.info("PDF password protected, attempting to remove protection with qpdf")
          val decryptTempFile = tmpDir.resolve(s"${inputFilePath.getFileName}.decrypt.pdf")
          val qpdfResult = decryptWithQpdf(decryptTempFile)

          // If we managed to decrypt the file, have another go at running ocrmypdf
          if (qpdfResult) {
            process(RedoOcr, Some(ocrExitCode), Some(decryptTempFile))
          } else {
            ocrExitCode
          }

        case _ =>
          ocrExitCode
      }
    }

    val exitCode = process(RedoOcr)

    exitCode match {
      // 0: success
      // 4: "An output file was created, but it does not seem to be a valid PDF. The file will be available."
      // 10: "A valid PDF was created, PDF/A conversion failed. The file will be available."
      // These both produce an output file (they're more like warnings than failures)
      // so we want to return the file instead of throwing an exception.
      case 0 | 4 | 10 => tempFile
      case 1 => throw OcrMyPdfBadArgs
      case 2 => throw OcrMyPdfInputFile
      case 3 => throw OcrMyPdfMissingDependency
      case 5 => throw OcrMyPdfFileAccessError
      case 6 => throw OcrMyPdfAlreadyDoneOcr
      case 7 => throw OcrMyPdfChildProcessError
      case 8 => throw OcrMyPdfEncryptedPdf
      case 9 => throw OcrMyPdfInvalidConfig
      case 15 => throw OcrMyPdfOtherError
      case 130 => throw OcrMyPdfCtrlC
      // This default case will cover code 143 where worker was terminated midway through.
      // Don't register this as a failure to allow another worker to pick it up
      case _ => throw OcrSubprocessInterruptedException
    }
  }
}