app/redact/PdfRedactor.scala

package redact

import scala.jdk.CollectionConverters._
import scala.util.Using
import com.typesafe.config.ConfigFactory
import java.io.{File, OutputStream}
import java.awt.Color
import org.apache.pdfbox.pdmodel.{PDDocument, PDPage, PDPageContentStream}
import org.apache.pdfbox.pdmodel.common.PDRectangle
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory
import org.apache.pdfbox.rendering.{ImageType, PDFRenderer}
import play.api.Logger
import scala.collection.Searching.Found

/**
 * Redacts text and images from PDF documents and writes a rasterised copy.
 *
 * The pipeline in [[PdfRedactor.redact(document:org\.apache\.pdfbox\.pdmodel\.PDDocument*]] is:
 * locate text to hide, paint black rectangles over it, redact images, optionally drop the
 * first page, then render every page to a JPEG so that the text underneath the rectangles
 * is not carried over into the output file.
 */
object PdfRedactor {

  val config = ConfigFactory.load()

  // Feature flags, read once from configuration when the object is initialised.
  val enableExactStringMatching: Boolean = config.getBoolean("redacted-exact-strings.enabled")
  val enableGreedyNameMatching: Boolean = config.getBoolean("greedy-name-match.enabled")
  val enableNewPageSplittingAndDeletion: Boolean = config.getBoolean("new-page-split-behaviour.enabled")

  /** Strings redacted verbatim when exact string matching is enabled (config key `redact.genderedwords.list`). */
  val redactStringsList: List[String] = config.getStringList("redact.genderedwords.list").asScala.toList

  /** Extra strings always searched for alongside the caller-supplied names (config key `redact.petnames.list`). */
  val commonNames: List[String] = config.getStringList("redact.petnames.list").asScala.toList

  /**
   * Splits `doc` at each candidate's `firstPage` and pairs every candidate with its document.
   *
   * When the first candidate does not start on page 0, the leading split document is closed
   * and dropped before pairing (presumably it holds pages that belong to no candidate —
   * confirm against `CustomSplitter`). When `candidates` is empty, every split document is
   * closed and an empty list is returned instead of failing on `head`.
   *
   * @param doc        the document to split
   * @param candidates candidates whose `firstPage` values are handed to `CustomSplitter`
   * @return each candidate zipped with its split document; the caller owns (and must close)
   *         the returned documents
   */
  def splitCandidates(doc: PDDocument, candidates: List[Candidate]): List[(Candidate, PDDocument)] = {
    val docs = new CustomSplitter(candidates.map(_.firstPage)).split(doc).asScala.toList
    candidates.headOption match {
      case None =>
        // Nothing to pair with: release the split documents rather than leaking them.
        docs.foreach(_.close())
        Nil
      case Some(first) if first.firstPage != 0 =>
        docs.headOption.foreach(_.close())
        candidates.zip(docs.drop(1))
      case Some(_) =>
        candidates.zip(docs)
    }
  }

  /**
   * Loads `source` and returns the candidates reported by `TextFinder.analyse`.
   *
   * The document is closed even when analysis throws.
   */
  def candidates(source: File): List[Candidate] =
    Using.resource(PDDocument.load(source))(document => TextFinder.analyse(document))

  /**
   * Loads `source`, redacts it and writes the rasterised result to `destination`.
   *
   * The loaded document is closed even when redaction throws. `destination` is not closed.
   *
   * @param source      PDF file to redact
   * @param destination stream the rasterised PDF is saved to
   * @param names       names to search for and black out
   */
  def redact(source: File, destination: OutputStream, names: List[String]): Unit =
    Using.resource(PDDocument.load(source)) { document =>
      redact(document, destination, names)
    }

  /**
   * Redacts `document` in place and writes a rasterised copy to `destination`.
   *
   * `document` is mutated (rectangles are appended, images are redacted and the first page
   * may be removed) but is not closed here; neither is `destination`.
   *
   * @param document    the PDF to redact
   * @param destination stream the rasterised PDF is saved to
   * @param names       names to search for and black out
   */
  def redact(document: PDDocument, destination: OutputStream, names: List[String]): Unit = {
    redactFoundText(document = document, redactions = findRedactions(document, names))
    ImageRedactor.redactImages(document)
    removeFirstPage(document)
    writeRasterised(document, destination)
  }

  /** Collects every piece of text in `document` that should be covered, honouring the feature flags. */
  private def findRedactions(document: PDDocument, names: List[String]): List[FoundText] = {
    val foundNames: List[FoundText] =
      (names ++ commonNames).flatMap(name => TextFinder.findString(document, name))

    val regexedNames: List[FoundText] =
      if (enableGreedyNameMatching) names.flatMap(word => TextFinder.findStringsMatchingRegex(document, word))
      else List.empty[FoundText]

    val redactedWords: List[FoundText] =
      if (enableExactStringMatching) redactStringsList.flatMap(word => TextFinder.findString(document, word))
      else List.empty[FoundText]

    List(
      foundNames,
      redactedWords,
      TextFinder.findEmail(document),
      TextFinder.findUrl(document),
      TextFinder.findWebsite(document, "github.com"),
      TextFinder.findWebsite(document, "linkedin.com"),
      regexedNames
    ).flatten
  }

  /**
   * Renders each page of `document` at 300 DPI into a JPEG, places it on a new A4 page and
   * saves the resulting document to `destination`.
   *
   * Every page image is stretched to the full A4 rectangle regardless of the source page size.
   * The intermediate document and each content stream are closed even on failure.
   */
  private def writeRasterised(document: PDDocument, destination: OutputStream): Unit =
    Using.resource(new PDDocument()) { rasterisedDoc =>
      val renderer = new PDFRenderer(document)
      for (pageIndex <- 0 until document.getNumberOfPages) {
        val image = renderer.renderImageWithDPI(pageIndex, 300, ImageType.RGB)
        val pdPage = new PDPage(PDRectangle.A4)
        val pdImage = JPEGFactory.createFromImage(rasterisedDoc, image)
        Using.resource(new PDPageContentStream(rasterisedDoc, pdPage)) { contentStream =>
          contentStream.drawImage(pdImage, 0, 0, PDRectangle.A4.getWidth, PDRectangle.A4.getHeight)
        }
        rasterisedDoc.addPage(pdPage)
      }
      rasterisedDoc.save(destination)
    }

  /**
   * Removes page 0 from `document` when the new page-splitting behaviour is enabled.
   *
   * Does nothing when the flag is off or the document has no pages.
   */
  def removeFirstPage(document: PDDocument): Unit =
    if (enableNewPageSplittingAndDeletion && document.getNumberOfPages > 0) document.removePage(0)

  /**
   * Paints a filled black rectangle over every entry of `redactions` on its page.
   *
   * Each rectangle is extended downwards by 30% of the text height. The y coordinate is
   * flipped against the page bounding-box height, which suggests `FoundText` uses a
   * top-left origin while PDF user space uses bottom-left — confirm against `TextFinder`.
   *
   * @param document   document whose pages are drawn on (content is appended to each page)
   * @param redactions text locations to cover; `pageIndex` selects the page
   */
  def redactFoundText(document: PDDocument, redactions: List[FoundText]): Unit = {
    val allPages = document.getDocumentCatalog.getPages

    redactions.groupBy(_.pageIndex).foreach { case (index, pageData) =>
      val page = allPages.get(index)
      val pageHeight = page.getBBox.getHeight
      Using.resource(
        new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, false, true)
      ) { contentStream =>
        // The fill colour is part of the graphics state, so setting it once per page suffices.
        contentStream.setNonStrokingColor(Color.BLACK)
        pageData.foreach { case FoundText(_, x1, y1, x2, y2, _) =>
          val padding = ((y2 - y1) * 0.3).toFloat
          contentStream.addRect(x1, pageHeight - y1 - padding, x2 - x1, y2 - y1 + padding)
          contentStream.fill()
        }
      }
    }
  }
}

app/redact/PdfRedactor.scala (100 lines of code) (raw):