in pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java [494:727]
protected void writePage() throws IOException
{
// Overview: emits the text collected in charactersByArticle, one article (one
// List<TextPosition>) at a time. Positions are optionally sorted, then walked in
// order while LineItems are queued in a line buffer. The buffer is flushed with
// writeLine(normalize(line)) whenever a position does not overlap the current line
// vertically, and once more at the end of each article. A word separator item is
// queued when a position starts to the right of the expected start of the next word.
//
// State carried from one TextPosition to the next. All of these are declared outside
// the article loop, so they also carry over from one article to the next.
// Largest positionY seen on the current line; passed to overlap().
float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
// Smallest (positionY - positionHeight) seen on the current line. It is maintained
// but never read in this method (see the XXX note further down).
float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
// positionX + positionWidth of the previously processed position.
float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
// getWidthOfSpace() of the previously processed position.
float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
// Largest positionHeight seen on the current line; passed to overlap() and
// handleLineSeparation().
float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
// Wrapper around the previously processed position; null until the first one is processed.
PositionWrapper lastPosition = null;
// Set to the first processed position of the page, afterwards to whatever
// handleLineSeparation() returns.
PositionWrapper lastLineStartPosition = null;
boolean startOfPage = true; // flag to indicate start of page
// Set to true at the beginning of every article, cleared once consumed below.
boolean startOfArticle;
// NOTE(review): writePageStart() is skipped when there are no articles, whereas
// writePageEnd() at the bottom of this method is always called - confirm the
// asymmetry is intended.
if (!charactersByArticle.isEmpty())
{
writePageStart();
}
for (List<TextPosition> textList : charactersByArticle)
{
if (getSortByPosition())
{
TextPositionComparator comparator = new TextPositionComparator();
// because the TextPositionComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom mergesort implementation (which is slower, unfortunately).
// The standard sort is tried first; IterativeMergeSort is only the fallback
// used when List.sort rejects the comparator with an IllegalArgumentException.
try
{
textList.sort(comparator);
}
catch (IllegalArgumentException e)
{
IterativeMergeSort.sort(textList, comparator);
}
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
// (only done when sorting by position is enabled)
removeContainedSpaces(textList);
}
startArticle();
startOfArticle = true;
// Now cycle through to print the text.
// We queue up a line at a time before we print so that we can convert
// the line from presentation form to logical form (if needed).
List<LineItem> line = new ArrayList<>();
Iterator<TextPosition> textIter = textList.iterator();
// PDF files don't always store spaces. We will need to guess where we should add
// spaces based on the distances between TextPositions. Historically, this was done
// based on the size of the space character provided by the font. In general, this
// worked but there were cases where it did not work. Calculating the average character
// width and using that as a metric works better in some cases but fails in some cases
// where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
// these examples.
// Keeps track of the previous average character width
// (a negative value means "no previous average available")
float previousAveCharWidth = -1;
while (textIter.hasNext())
{
TextPosition position = textIter.next();
PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getUnicode();
// PDFBOX-3774: conditionally ignore spaces from the content stream
// The position is skipped before any of the per-line state is touched, so an
// ignored space glyph has no influence on the spacing or line-break decisions.
if (" ".equals(characterValue) && getIgnoreContentStreamSpaceGlyphs())
{
continue;
}
// Resets the average character width when we see a change in font
// or a change in the font size
if (lastPosition != null
&& hasFontOrSizeChanged(position, lastPosition.getTextPosition()))
{
previousAveCharWidth = -1;
}
float positionX;
float positionY;
float positionWidth;
float positionHeight;
// If we are sorting, then we need to use the text direction
// adjusted coordinates, because they were used in the sorting.
if (getSortByPosition())
{
positionX = position.getXDirAdj();
positionY = position.getYDirAdj();
positionWidth = position.getWidthDirAdj();
positionHeight = position.getHeightDir();
}
else
{
positionX = position.getX();
positionY = position.getY();
positionWidth = position.getWidth();
positionHeight = position.getHeight();
}
// The current amount of characters in a word
// (taken as the number of entries in getIndividualWidths()).
// NOTE(review): if that array can be empty, the float divisions below yield
// Infinity/NaN rather than throwing - confirm this cannot happen.
int wordCharCount = position.getIndividualWidths().length;
// Estimate the expected width of the space based on the
// space character with some margin.
float wordSpacing = position.getWidthOfSpace();
float deltaSpace;
if (Float.compare(wordSpacing, 0) == 0 || Float.isNaN(wordSpacing))
{
// No usable space width: use the largest float so that the Math.min() below
// picks the average-character-width estimate instead.
deltaSpace = Float.MAX_VALUE;
}
else
{
// presumably LAST_WORD_SPACING_RESET_VALUE is negative, i.e. this branch means
// "no previous word spacing recorded yet" - TODO confirm
if (lastWordSpacing < 0)
{
deltaSpace = wordSpacing * getSpacingTolerance();
}
else
{
// average the current and the previous space width before applying the tolerance
deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
}
}
// Estimate the expected width of the space based on the average character width
// with some margin. This calculation does not make a true average (average of
// averages) but we found that it gave the best results after numerous experiments.
// Based on experiments we also found that .3 worked well.
float averageCharWidth;
if (previousAveCharWidth < 0)
{
averageCharWidth = positionWidth / wordCharCount;
}
else
{
averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
}
float deltaCharWidth = averageCharWidth * getAverageCharTolerance();
// Compares the values obtained by the average method and the wordSpacing method
// and picks the smaller number.
// The estimate is left at its reset value while no end-of-text x coordinate has
// been recorded yet, which disables the word separator test below.
float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
if (Float.compare(endOfLastTextX, END_OF_LAST_TEXT_X_RESET_VALUE) != 0)
{
expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaSpace, deltaCharWidth);
}
if (lastPosition != null)
{
// NOTE(review): the article-start flag is set on lastPosition, i.e. on the
// position processed before the current one. Because lastPosition is not reset
// between articles, for every article after the first this is the final
// position of the previous article - confirm this is intended.
if (startOfArticle)
{
lastPosition.setArticleStart();
startOfArticle = false;
}
// RDD - Here we determine whether this text object is on the current
// line. We use the lastBaselineFontSize to handle the superscript
// case, and the size of the current font to handle the subscript case.
// Text must overlap with the last rendered baseline text by at least
// a small amount in order to be considered as being on the same line.
// NOTE(review): there is no variable named lastBaselineFontSize in this method;
// the check below is driven by maxYForLine and maxHeightForLine.
// XXX BC: In theory, this check should really check if the next char is in
// full range seen in this line. This is what I tried to do with minYTopForLine,
// but this caused a lot of regression test failures. So, I'm leaving it be for
// now
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
{
// No vertical overlap: flush the queued line and start a new one.
writeLine(normalize(line));
line.clear();
lastLineStartPosition = handleLineSeparation(current, lastPosition,
lastLineStartPosition, maxHeightForLine);
// Resetting the estimate prevents a word separator from being queued at the
// start of the new line; the per-line extents start over as well.
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
}
// test if our TextPosition starts after a new word would be expected to start
if (Float.compare(expectedStartOfNextWordX, EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE) != 0
&& expectedStartOfNextWordX < positionX
// only bother adding a word separator if the last character was not a word separator
&& (wordSeparator.isEmpty() || //
(lastPosition.getTextPosition().getUnicode() != null
&& !lastPosition.getTextPosition().getUnicode()
.endsWith(wordSeparator))))
{
line.add(LineItem.getWordSeparator());
}
// if there is at least the equivalent of one space
// between the last character and the current one,
// reset the max line height as the font size may have completely changed.
// NOTE(review): this comparison uses the raw getX() values even when sorting by
// position is enabled, while the rest of the loop then uses the direction
// adjusted coordinates - confirm this is intended.
if (Math.abs(position.getX()
- lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace))
{
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
}
}
// keep track of the largest y coordinate seen on the current line
if (positionY >= maxYForLine)
{
maxYForLine = positionY;
}
// RDD - endX is what PDF considers to be the x coordinate of the
// end position of the text. We use it in computing our metrics below.
endOfLastTextX = positionX + positionWidth;
// add it to the list
// A position without a unicode value is not queued, but it still updates the
// tracking state below.
if (characterValue != null)
{
// only true for the very first position that is processed on the page
if (startOfPage && lastPosition == null)
{
writeParagraphStart();// not sure this is correct for RTL?
}
line.add(new LineItem(position));
}
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
lastPosition = current;
// The first processed position of the page is flagged as both paragraph start and
// line start, and becomes the initial line start position.
if (startOfPage)
{
lastPosition.setParagraphStart();
lastPosition.setLineStart();
lastLineStartPosition = lastPosition;
startOfPage = false;
}
// remember the metrics of this position for the next iteration
lastWordSpacing = wordSpacing;
previousAveCharWidth = averageCharWidth;
}
// print the final line
// (the paragraph end is only written when there was something left to flush)
if (!line.isEmpty())
{
writeLine(normalize(line));
writeParagraphEnd();
}
endArticle();
}
// called unconditionally, unlike writePageStart() at the top of this method
writePageEnd();
}