protected void writePage()

in pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java [494:727]


    protected void writePage() throws IOException
    {
        float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
        float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
        float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
        float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
        float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
        PositionWrapper lastPosition = null;
        PositionWrapper lastLineStartPosition = null;

        boolean startOfPage = true; // flag to indicate start of page
        boolean startOfArticle;
        if (!charactersByArticle.isEmpty())
        {
            writePageStart();
        }

        for (List<TextPosition> textList : charactersByArticle)
        {
            if (getSortByPosition())
            {
                TextPositionComparator comparator = new TextPositionComparator();

                // because the TextPositionComparator is not transitive, but
                // JDK7+ enforces transitivity on comparators, we need to use
                // a custom mergesort implementation (which is slower, unfortunately).
                try
                {
                    textList.sort(comparator);
                }
                catch (IllegalArgumentException e)
                {
                    IterativeMergeSort.sort(textList, comparator);
                }
                // PDFBOX-5487: Remove all space characters if contained within the adjacent letters
                removeContainedSpaces(textList);
            }

            startArticle();
            startOfArticle = true;

            // Now cycle through to print the text.
            // We queue up a line at a time before we print so that we can convert
            // the line from presentation form to logical form (if needed).
            List<LineItem> line = new ArrayList<>();

            Iterator<TextPosition> textIter = textList.iterator();
            // PDF files don't always store spaces. We will need to guess where we should add
            // spaces based on the distances between TextPositions. Historically, this was done
            // based on the size of the space character provided by the font. In general, this
            // worked but there were cases where it did not work. Calculating the average character
            // width and using that as a metric works better in some cases but fails in some cases
            // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
            // these examples.

            // Keeps track of the previous average character width
            float previousAveCharWidth = -1;
            while (textIter.hasNext())
            {
                TextPosition position = textIter.next();
                PositionWrapper current = new PositionWrapper(position);
                String characterValue = position.getUnicode();

                // PDFBOX-3774: conditionally ignore spaces from the content stream
                if (" ".equals(characterValue) && getIgnoreContentStreamSpaceGlyphs())
                {
                    continue;
                }

                // Resets the average character width when we see a change in font
                // or a change in the font size
                if (lastPosition != null
                        && hasFontOrSizeChanged(position, lastPosition.getTextPosition()))
                {
                    previousAveCharWidth = -1;
                }
                float positionX;
                float positionY;
                float positionWidth;
                float positionHeight;

                // If we are sorting, then we need to use the text direction
                // adjusted coordinates, because they were used in the sorting.
                if (getSortByPosition())
                {
                    positionX = position.getXDirAdj();
                    positionY = position.getYDirAdj();
                    positionWidth = position.getWidthDirAdj();
                    positionHeight = position.getHeightDir();
                }
                else
                {
                    positionX = position.getX();
                    positionY = position.getY();
                    positionWidth = position.getWidth();
                    positionHeight = position.getHeight();
                }

                // The current amount of characters in a word
                int wordCharCount = position.getIndividualWidths().length;

                // Estimate the expected width of the space based on the
                // space character with some margin.
                float wordSpacing = position.getWidthOfSpace();
                float deltaSpace;
                if (Float.compare(wordSpacing, 0) == 0 || Float.isNaN(wordSpacing))
                {
                    deltaSpace = Float.MAX_VALUE;
                }
                else
                {
                    if (lastWordSpacing < 0)
                    {
                        deltaSpace = wordSpacing * getSpacingTolerance();
                    }
                    else
                    {
                        deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                    }
                }

                // Estimate the expected width of the space based on the average character width
                // with some margin. This calculation does not make a true average (average of
                // averages) but we found that it gave the best results after numerous experiments.
                // Based on experiments we also found that .3 worked well.
                float averageCharWidth;
                if (previousAveCharWidth < 0)
                {
                    averageCharWidth = positionWidth / wordCharCount;
                }
                else
                {
                    averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
                }
                float deltaCharWidth = averageCharWidth * getAverageCharTolerance();

                // Compares the values obtained by the average method and the wordSpacing method
                // and picks the smaller number.
                float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                if (Float.compare(endOfLastTextX, END_OF_LAST_TEXT_X_RESET_VALUE) != 0)
                {
                    expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaSpace, deltaCharWidth);
                }

                if (lastPosition != null)
                {
                    if (startOfArticle)
                    {
                        lastPosition.setArticleStart();
                        startOfArticle = false;
                    }
                    // RDD - Here we determine whether this text object is on the current
                    // line. We use the lastBaselineFontSize to handle the superscript
                    // case, and the size of the current font to handle the subscript case.
                    // Text must overlap with the last rendered baseline text by at least
                    // a small amount in order to be considered as being on the same line.

                    // XXX BC: In theory, this check should really check if the next char is in
                    // full range seen in this line. This is what I tried to do with minYTopForLine,
                    // but this caused a lot of regression test failures. So, I'm leaving it be for
                    // now
                    if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
                    {
                        writeLine(normalize(line));
                        line.clear();
                        lastLineStartPosition = handleLineSeparation(current, lastPosition,
                                lastLineStartPosition, maxHeightForLine);
                        expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                        maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                        maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                        minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                    }
                    // test if our TextPosition starts after a new word would be expected to start
                    if (Float.compare(expectedStartOfNextWordX, EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE) != 0
                            && expectedStartOfNextWordX < positionX
                            // only bother adding a word separator if the last character was not a word separator
                            && (wordSeparator.isEmpty() || //
                                    (lastPosition.getTextPosition().getUnicode() != null
                                            && !lastPosition.getTextPosition().getUnicode()
                                                    .endsWith(wordSeparator))))
                    {
                        line.add(LineItem.getWordSeparator());
                    }
                    // if there is at least the equivalent of one space
                    // between the last character and the current one,
                    // reset the max line height as the font size may have completely changed.
                    if (Math.abs(position.getX()
                            - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace))
                    {
                        maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                        maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                        minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                    }
                }
                if (positionY >= maxYForLine)
                {
                    maxYForLine = positionY;
                }
                // RDD - endX is what PDF considers to be the x coordinate of the
                // end position of the text. We use it in computing our metrics below.
                endOfLastTextX = positionX + positionWidth;

                // add it to the list
                if (characterValue != null)
                {
                    if (startOfPage && lastPosition == null)
                    {
                        writeParagraphStart();// not sure this is correct for RTL?
                    }
                    line.add(new LineItem(position));
                }
                maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
                minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
                lastPosition = current;
                if (startOfPage)
                {
                    lastPosition.setParagraphStart();
                    lastPosition.setLineStart();
                    lastLineStartPosition = lastPosition;
                    startOfPage = false;
                }
                lastWordSpacing = wordSpacing;
                previousAveCharWidth = averageCharWidth;
            }
            // print the final line
            if (!line.isEmpty())
            {
                writeLine(normalize(line));
                writeParagraphEnd();
            }
            endArticle();
        }
        writePageEnd();
    }