in pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java [897:1055]
protected void processTextPosition(TextPosition text)
{
    // Handles a single text position: applies the pending replacement string held in
    // actualText (if any), optionally drops the position when it duplicates an already
    // recorded overlapping one, and otherwise stores it in the list of the article it
    // belongs to, recombining a detached diacritic with its neighbouring character.
    if (actualText != null)
    {
        // Only the first position seen while actualText is set receives the whole
        // replacement string; every later one is blanked.
        if (firstActualTextPosition)
        {
            text.setUnicode(actualText);
            firstActualTextPosition = false;
        }
        else
        {
            text.setUnicode("");
        }
    }

    // Duplicate suppression is bypassed while a replacement string is active.
    if (suppressDuplicateOverlappingText && actualText == null
            && isDuplicateOverlappingText(text))
    {
        return;
    }

    List<TextPosition> textList = charactersByArticle.get(findArticleDivisionIndex(text));
    addOrMergeDiacritic(textList, text);
}

/**
 * Tells whether the same Unicode string has already been recorded at (nearly) the same
 * position. When it has not, the position of {@code text} is recorded in
 * {@code characterListMapping} so that later duplicates can be detected.
 *
 * @param text the text position to check
 * @return {@code true} if an earlier position with the same Unicode string lies within the
 * tolerance in both x and y, {@code false} if this position was newly recorded
 */
private boolean isDuplicateOverlappingText(TextPosition text)
{
    String textCharacter = text.getUnicode();
    float textX = text.getX();
    float textY = text.getY();
    TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping
            .computeIfAbsent(textCharacter, k -> new TreeMap<>());

    // The tolerance is one third of the average width per character of this position.
    // The divisor is clamped to 1: an empty Unicode string would otherwise divide by zero,
    // yielding an infinite tolerance (every earlier empty-string position would match,
    // wherever it is) or NaN when the width is zero as well.
    int characterCount = Math.max(1, textCharacter.length());
    float tolerance = text.getWidth() / characterCount / 3.0f;

    // Both lookups are half-open ranges: the lower bound is inclusive, the upper exclusive.
    SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance,
            textX + tolerance);
    for (TreeSet<Float> xMatch : xMatches.values())
    {
        if (!xMatch.subSet(textY - tolerance, textY + tolerance).isEmpty())
        {
            return true;
        }
    }

    sameTextCharacters.computeIfAbsent(textX, k -> new TreeSet<>()).add(textY);
    return false;
}

/**
 * Determines the index into {@code charactersByArticle} under which a text position is
 * stored.
 *
 * @param text the text position to classify
 * @return 0 when beads are not used or a {@code null} bead rectangle is encountered;
 * {@code i * 2 + 1} for the first bead rectangle {@code i} containing the position;
 * otherwise {@code i * 2} for the first rectangle matched by the fallback tests, or the
 * last index of {@code charactersByArticle} when nothing matched
 */
private int findArticleDivisionIndex(TextPosition text)
{
    if (!shouldSeparateByBeads)
    {
        return 0;
    }

    float x = text.getX();
    float y = text.getY();
    int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
    int notFoundButFirstLeftArticleDivisionIndex = -1;
    int notFoundButFirstAboveArticleDivisionIndex = -1;

    for (int i = 0; i < beadRectangles.size(); i++)
    {
        PDRectangle rect = beadRectangles.get(i);
        if (rect == null)
        {
            // A null entry ends the search and selects the first list.
            return 0;
        }
        if (rect.contains(x, y))
        {
            return i * 2 + 1;
        }

        boolean left = x < rect.getLowerLeftX();
        boolean above = y < rect.getUpperRightY();
        // NOTE(review): the variable name says "left AND above" but the test is an OR.
        // With the OR, the two fallbacks below can only be set after this one already
        // has been, so they never influence the result. Confirm whether && was intended;
        // behaviour is deliberately left unchanged here.
        if ((left || above) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
        {
            notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
        }
        else if (left && notFoundButFirstLeftArticleDivisionIndex == -1)
        {
            notFoundButFirstLeftArticleDivisionIndex = i * 2;
        }
        else if (above && notFoundButFirstAboveArticleDivisionIndex == -1)
        {
            notFoundButFirstAboveArticleDivisionIndex = i * 2;
        }
    }

    // No rectangle contained the position: use the fallbacks in order of preference.
    if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1)
    {
        return notFoundButFirstLeftAndAboveArticleDivisionIndex;
    }
    if (notFoundButFirstLeftArticleDivisionIndex != -1)
    {
        return notFoundButFirstLeftArticleDivisionIndex;
    }
    if (notFoundButFirstAboveArticleDivisionIndex != -1)
    {
        return notFoundButFirstAboveArticleDivisionIndex;
    }
    return charactersByArticle.size() - 1;
}

/**
 * Appends a text position to an article's list, recombining diacritics.
 *
 * In the wild, some PDF documents put diacritics (accents on top of characters) into a
 * separate Tj element. When displayed graphically the two chunks get overlaid; for text
 * output the overlay has to be done here. Only the last entry of the list is examined, so
 * a diacritic is recombined only when it and its character are consecutive.
 *
 * @param textList the list of positions collected so far for the article
 * @param text the text position to add or merge
 */
private void addOrMergeDiacritic(List<TextPosition> textList, TextPosition text)
{
    if (textList.isEmpty())
    {
        textList.add(text);
        return;
    }

    int lastIndex = textList.size() - 1;
    TextPosition previousTextPosition = textList.get(lastIndex);
    if (text.isDiacritic() && previousTextPosition.contains(text))
    {
        // The new position is the diacritic: fold it into the previous entry.
        previousTextPosition.mergeDiacritic(text);
    }
    else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
    {
        // The previous entry was the diacritic: fold it into the new position, which
        // then takes its place in the list.
        text.mergeDiacritic(previousTextPosition);
        textList.set(lastIndex, text);
    }
    else
    {
        textList.add(text);
    }
}