app/redact/TextFinder.scala

package redact

import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.{PDFTextStripper, TextPosition}
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
import play.api.Logger
import com.typesafe.config.ConfigFactory

/** A hit reported by one of the finders in this file.
  *
  * @param pageIndex the stripper's current page number minus one
  * @param x1        X of the first matched glyph
  * @param y1        Y of the first matched glyph
  * @param x2        X of the last matched glyph plus its width
  * @param y2        Y of the last matched glyph plus its height
  * @param text      the whole chunk of text the match was found in (not just the matched part)
  */
case class FoundText(pageIndex: Int, x1: Float, y1: Float, x2: Float, y2: Float, text: String)

/** Entry points for locating text inside a PDF document. */
object TextFinder {

  /** Finds every match of the regular expression `needle`, ignoring case.
    *
    * [[RegexFinder]] lower-cases the page text before matching, so the pattern has to be
    * case-insensitive as well. That is done with the `(?iu)` flags rather than by lower-casing
    * the pattern text: lower-casing would also rewrite escapes such as `\D`, `\S`, `\W`, `\B`
    * or `\Q...\E` into different constructs.
    */
  def findStringsMatchingRegex(document: PDDocument, needle: String): List[FoundText] =
    runRegexFinder(document, ("(?iu)" + needle).r)

  /** Finds every delimited, case-insensitive occurrence of the literal string `needle`. */
  def findString(document: PDDocument, needle: String): List[FoundText] = {
    val finder = new TextFinder(needle)
    finder.getText(document)
    finder.locations.result()
  }

  /** Finds everything that looks like an e-mail address. */
  def findEmail(document: PDDocument): List[FoundText] =
    runRegexFinder(
      document,
      """([a-zA-Z0-9\.!#$%&'*+/=?^_`{|}~-]+)@([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)""".r
    )

  /** Finds `domain`, optionally followed by a path that runs up to the next space.
    *
    * `domain` is inserted into the pattern as-is, so it is interpreted as a regular expression,
    * and it is matched against lower-cased text.
    */
  def findWebsite(document: PDDocument, domain: String): List[FoundText] =
    runRegexFinder(document, s"""$domain(\\/[^ ]*)?""".r)

  /** Finds http/https URLs. */
  def findUrl(document: PDDocument): List[FoundText] =
    runRegexFinder(
      document,
      """http[s]?:\/\/([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)(\/[^ ]*)?""".r
    )

  /** Returns the start index of every occurrence of the literal `needle` in `haystack` that is
    * delimited on each side by a word boundary, an underscore or a dot.
    *
    * The delimiters are checked with look-arounds instead of being consumed. The returned index
    * is therefore always the first character of the needle itself (never the `_` or `.` in
    * front of it), and a delimiter shared by two neighbouring occurrences (`smith_smith`) does
    * not hide the second one. The needle is quoted, so regex metacharacters in it match
    * literally.
    *
    * @param needle   literal text to look for; matching is case-sensitive here
    * @param haystack text to search
    * @return start indices of the needle in `haystack`, in ascending order
    */
  def findMultiple(needle: String, haystack: String): List[Int] = {
    val delimited = s"(?:\\b|(?<=[_.]))${Regex.quote(needle)}(?:\\b|(?=[_.]))".r
    delimited.findAllMatchIn(haystack).map(_.start).toList
  }

  /** Runs [[AnalyseCV]] over `doc` and returns the candidates it detected.
    *
    * Walking from the last candidate backwards, each candidate's `lastPage` is replaced by the
    * following candidate's `firstPage - 1`; the final candidate gets the document's page count.
    */
  def analyse(doc: PDDocument): List[Candidate] = {
    val analyser = new AnalyseCV()
    analyser.getText(doc)
    val detected = analyser.candidates.result()
    val (_, withLastPages) =
      detected.foldRight((doc.getNumberOfPages, List.empty[Candidate])) {
        case (candidate, (lastPage, done)) =>
          (candidate.firstPage - 1, candidate.copy(lastPage = lastPage) :: done)
      }
    withLastPages
  }

  /** Builds the [[FoundText]] spanning the glyphs `start` (inclusive) to `endExclusive`.
    *
    * The end is clamped to the last available glyph, as the original `RegexFinder` already did
    * for its end index. Returns `None` instead of throwing when the span is empty or starts
    * outside `textPositions`.
    */
  private[redact] def boundingBox(
      pageIndex: Int,
      text: String,
      textPositions: java.util.List[TextPosition],
      start: Int,
      endExclusive: Int
  ): Option[FoundText] = {
    val lastIndex = math.min(endExclusive, textPositions.size()) - 1
    if (start < 0 || start > lastIndex) None
    else {
      val first = textPositions.get(start)
      val last = textPositions.get(lastIndex)
      Some(
        FoundText(
          pageIndex = pageIndex,
          x1 = first.getX,
          y1 = first.getY,
          x2 = last.getX + last.getWidth,
          y2 = last.getY + last.getHeight,
          text = text
        )
      )
    }
  }

  /** Runs a [[RegexFinder]] for `regex` over the whole document and collects its hits. */
  private def runRegexFinder(document: PDDocument, regex: Regex): List[FoundText] = {
    val finder = new RegexFinder(regex)
    finder.getText(document)
    finder.locations.result()
  }
}

/** A candidate header detected by [[AnalyseCV]], with the page range attributed to it. */
case class Candidate(
  firstName: String,
  lastName: String,
  id: String,
  jobText: String,
  jobId: String,
  firstPage: Int,
  lastPage: Int
)

/** Configuration-derived settings for [[AnalyseCV]]. */
object AnalyseCV {
  val config = ConfigFactory.load()
  val newPageSplitBehaviour: Boolean = config.getBoolean("new-page-split-behaviour.enabled")

  // Changing the split difference to two means that the first page of the redacted CV has the
  // candidate's name and the job applied for, rather than pulling in the next candidate's
  // unredacted details at the bottom.
  val firstPageSplitDifference: Int = if (newPageSplitBehaviour) 2 else 1
}

/** Text stripper that looks for candidate header lines and records a [[Candidate]] for each. */
class AnalyseCV() extends PDFTextStripper {
  val candidates: ListBuffer[Candidate] = new ListBuffer
  var potentialCandidate: Option[Candidate] = None
  // Most recent chunk first; never holds more than three chunks.
  var slidingWindow: List[String] = Nil
  super.setSortByPosition(true)

  // NOTE(review): in java.util.regex an unescaped `$` is an end-of-input anchor, so `$(\d+)$`
  // asks for digits after the end of the input and these patterns look unable to match.
  // The literals may originally have used escaped parentheses around the id and job id.
  // Left byte-identical here; confirm against a real document before changing them.
  // Only fullRegex is used in this file.
  val candidateNameRegex = """^([\p{L} ]*), ([\p{L} ]*) $(\d+)$ applied for job:(.*)$""".r
  val jobRegex = """^(.*) $(.*)$$""".r
  val fullRegex = """^([\p{L}- ]*), ([\p{L}- ]*) $(\d+)$ applied for job:(.*) $(.*)$$""".r

  /** Builds a candidate from a `fullRegex` match. Group 1 is the last name, group 2 the first
    * name; `jobId` is always left empty and group 5 is not read.
    */
  def candidateFromMatch(m: Regex.Match): Candidate =
    Candidate(
      firstName = m.group(2),
      lastName = m.group(1),
      id = m.group(3),
      jobText = m.group(4).trim,
      jobId = "",
      firstPage = getCurrentPageNo - AnalyseCV.firstPageSplitDifference,
      lastPage = getCurrentPageNo - 1
    )

  /** Tries `fullRegex` against the last three, two and one chunks joined by spaces (longest
    * first). A hit becomes the pending candidate; the first chunk without a hit after that
    * commits the pending candidate to `candidates`.
    */
  override protected def writeString(text: String, textPositions: java.util.List[TextPosition]): Unit = {
    slidingWindow = text :: slidingWindow.take(2)

    val matched: Option[Candidate] =
      List(3, 2, 1).iterator
        .map(n => fullRegex.findFirstMatchIn(slidingWindow.take(n).reverse.mkString(" ")))
        .collectFirst { case Some(m) => candidateFromMatch(m) }

    (matched, potentialCandidate) match {
      case (Some(_), _) =>
        potentialCandidate = matched
      case (None, Some(pending)) =>
        candidates.append(pending)
        potentialCandidate = None
      case _ =>
    }
  }
}

/** Text stripper that records where the literal `needle` occurs, ignoring case. */
class TextFinder(val needle: String) extends PDFTextStripper {
  val locations: ListBuffer[FoundText] = new ListBuffer
  super.setSortByPosition(true)

  // Lower-cased once; its length is the number of glyphs each hit spans.
  private val loweredNeedle: String = needle.toLowerCase

  override protected def writeString(text: String, textPositions: java.util.List[TextPosition]): Unit =
    TextFinder.findMultiple(loweredNeedle, text.toLowerCase).foreach { start =>
      TextFinder
        .boundingBox(getCurrentPageNo - 1, text, textPositions, start, start + loweredNeedle.length)
        .foreach(found => locations.append(found))
    }
}

/** Text stripper that records where `regex` matches the lower-cased text of each chunk. */
class RegexFinder(regex: Regex) extends PDFTextStripper {
  val locations: ListBuffer[FoundText] = new ListBuffer
  super.setSortByPosition(true)

  /** Start (inclusive) and end (exclusive) offsets of every match in `haystack`. */
  private def find(haystack: String): List[(Int, Int)] =
    regex.findAllMatchIn(haystack).map(m => (m.start, m.end)).toList

  override protected def writeString(text: String, textPositions: java.util.List[TextPosition]): Unit =
    find(text.toLowerCase).foreach { case (start, end) =>
      // Empty matches and spans outside the glyph list are skipped by boundingBox.
      TextFinder
        .boundingBox(getCurrentPageNo - 1, text, textPositions, start, end)
        .foreach(found => locations.append(found))
    }
}

app/redact/TextFinder.scala (141 lines of code) (raw):