backend/app/utils/PDFUtil.scala (91 lines of code) (raw):
package utils
import model.frontend.{HighlightableText, TextHighlight}
import model.index.{HighlightSpan, FindHighlight, PageHighlight, SearchHighlight}
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.common.PDRectangle
import org.apache.pdfbox.pdmodel.graphics.color.{PDColor, PDDeviceRGB}
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup
import org.apache.pdfbox.text.{PDFTextStripper, TextPosition}
import java.util
import scala.jdk.CollectionConverters._
import scala.math.{atan2, cos, sin, sqrt}
object PDFUtil {
// Highlight colour: amber, RGB 253/182/0. Colour components handed to PDColor for DeviceRGB are expressed in the
// range 0.0 to 1.0 rather than 0 to 255, so the 8-bit channel values are scaled down here.
val highlightColour: PDColor = new PDColor(Array(253f / 255f, 182f / 255f, 0f), PDDeviceRGB.INSTANCE)
// Returns the text of a single page of `doc` as one string.
//
// PDFBox normally issues newlines to approximate the lines on the page, which makes it difficult to match up the
// highlighted text that comes back from Elasticsearch with offsets into the text when rendering the highlights.
// So the default output is bypassed: writeString is overridden to collect the text itself, and exactly one
// separator character (a single space) is appended after every chunk of text PDFBox hands over.
//
// `pageNumber` is used as both the start page and the end page of the stripper.
def getPageText(doc: PDDocument, pageNumber: Int): String = {
  val collected = new StringBuilder()

  val stripper = new PDFTextStripper() {
    // Looking up the calling code within PDFTextStripper itself, this is called for each line
    override def writeString(text: String, notUsedJavaTextPositions: util.List[TextPosition]): Unit = {
      collected.append(text).append(" ")
    }
  }

  // Restrict extraction to the requested page only
  stripper.setStartPage(pageNumber)
  stripper.setEndPage(pageNumber)

  // The string returned by getText is discarded; only what writeString accumulated above is returned
  stripper.getText(doc)

  collected.toString()
}
// Marker for a line separator that we insert ourselves, as opposed to a character position reported by PDFBox.
//
// We still need to have a separator in ES between each line, otherwise it will treat the first word of the
// subsequent line as part of the last word of the preceding one. getSearchResultHighlights adds one of these
// markers after every writeString callback, and getText renders each marker as "\n".
//
// An Either is used alongside it to make it clear what is a character position coming back from PDFBox (Left)
// and what is a newline we have added ourselves (Right).
object NewlinePlaceholder
// Rebuilds the text for a sequence of entries: a PDFBox text position contributes its unicode string and a
// NewlinePlaceholder contributes a single "\n". Entries are concatenated in order with nothing in between.
def getText(positions: List[Either[TextPosition, NewlinePlaceholder.type]]): String = {
  val rendered = new StringBuilder()

  positions.foreach {
    case Left(glyph) => rendered.append(glyph.getUnicode)
    case Right(NewlinePlaceholder) => rendered.append("\n")
  }

  rendered.toString()
}
// Flattens a rectangle into eight coordinates: four x/y pairs, one per corner, in the order
// top left, top right, bottom left, bottom right.
def rectangleToQuadPoints(rekt: PDRectangle): Array[Float] = {
  val left = rekt.getLowerLeftX
  val right = rekt.getUpperRightX
  val top = rekt.getUpperRightY
  val bottom = rekt.getLowerLeftY

  Array(
    left, top, // top left
    right, top, // top right
    left, bottom, // bottom left
    right, bottom // bottom right
  )
}
// Old version: accepts the highlights wrapped in a HighlightableText and delegates to the list-based overload
// with isFind switched off.
def getSearchResultHighlights(highlights: HighlightableText, singlePageDoc: PDDocument, pageNumber: Int): List[PageHighlight] =
  getSearchResultHighlights(highlights.highlights, singlePageDoc, pageNumber, isFind = false)
// Works out where each text highlight sits on the page so that it can be drawn over the PDF.
//
// The whole of `singlePageDoc` is run through a PDFTextStripper (no page range is set) and every reported
// character position is recorded, followed by a NewlinePlaceholder after each writeString callback. Each
// highlight's startCharacter (inclusive) and endCharacter (exclusive) are then used as indices into that
// sequence. NOTE(review): this assumes the offsets were computed against text that has exactly one character per
// recorded position plus one separator per callback - confirm against the indexing code.
//
// highlights    - the highlights to locate; one PageHighlight is returned per entry, in the same order
// singlePageDoc - the document to extract character positions from
// pageNumber    - not read by this implementation; kept so the signature stays compatible with callers
// isFind        - when true the results are FindHighlights, otherwise SearchHighlights
def getSearchResultHighlights(highlights: List[TextHighlight], singlePageDoc: PDDocument, pageNumber: Int, isFind: Boolean = false): List[PageHighlight] = {
  // Collect into a builder: appending to an immutable List on every callback copies the whole list each time,
  // which is quadratic in the amount of text on the page
  val positionsBuilder = Vector.newBuilder[Either[TextPosition, NewlinePlaceholder.type]]

  val textStripper = new PDFTextStripper() {
    // Called by PDFTextStripper for each chunk of text it emits (each line, going by the calling code up the
    // stack - TODO confirm)
    override def writeString(notUsedText: String, javaTextPositions: util.List[TextPosition]): Unit = {
      javaTextPositions.asScala.foreach(position => positionsBuilder += Left(position))
      positionsBuilder += Right(NewlinePlaceholder)
    }
  }

  // Will call writeString above; the returned text itself is not needed
  textStripper.getText(singlePageDoc)

  val textPositions = positionsBuilder.result()

  highlights.map { highlight =>
    // Split the highlighted range into lines of characters: a regular character is appended to the last line,
    // a newline placeholder starts a new (initially empty) line
    // TODO SC: Make a case class for this to help readability?
    val highlightSpans: Vector[Vector[TextPosition]] = textPositions
      .slice(highlight.range.startCharacter, highlight.range.endCharacter)
      .foldLeft(Vector(Vector.empty[TextPosition])) {
        case (lines, Left(position)) => lines.init :+ (lines.last :+ position)
        case (lines, Right(NewlinePlaceholder)) => lines :+ Vector.empty[TextPosition]
      }

    val spans = highlightSpans.toList.flatMap { span =>
      for {
        // Sometimes the text stripper doesn't pull out matches correctly resulting in empty spans, which are
        // dropped here because they have no first/last character
        startCharacter <- span.headOption
        endCharacter <- span.lastOption
      } yield {
        // This coordinate system makes my head hurt
        val x = startCharacter.getX
        val y = startCharacter.getY

        val x1 = endCharacter.getX
        val y1 = endCharacter.getY

        val dX = x1 - x
        val dY = y1 - y

        // Distance from the first character to the last one, plus the last character's own width
        val width = sqrt(dX * dX + dY * dY) + endCharacter.getWidth
        val height = span.maxBy(_.getFontSize).getFontSize
        val rotation = atan2(dY, dX)

        // The highlight is rendered slightly off, we need to move the position by about 0.75 * the max character
        // height while also factoring in rotation. Here we're getting the x/y offsets by rotating a Y axis unit
        // vector and then multiplying the result by the offset magnitude. The Y axis has to be negated due to the
        // coordinate spaces.
        val offsetMagnitude = height * 0.75
        val offsetX = offsetMagnitude * sin(rotation)
        val offsetY = -offsetMagnitude * cos(rotation)

        HighlightSpan(x + offsetX, y + offsetY, width, height, rotation)
      }
    }

    if (isFind) {
      FindHighlight(highlight.id, highlight.index, spans)
    } else {
      SearchHighlight(highlight.id, highlight.index, spans)
    }
  }
}
}