protected void endPage(PDPage page) throws IOException

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java [696:836]


    /**
     * Finishes processing of a single page.
     * <p>
     * In order, this method:
     * <ol>
     *   <li>adds the per-page total and unmapped-unicode character counts to the metadata;</li>
     *   <li>walks the page's annotations: records annotation names and subtypes (each capped at
     *       {@code MAX_ANNOTATION_TYPES} entries), processes file attachments, widgets and
     *       file specifications found in other annotations, flags 3D annotations, and, when
     *       {@code config.isExtractAnnotationText()} is set, writes URI links and markup
     *       title/subject/contents to the XHTML output;</li>
     *   <li>runs OCR on the page when the configured OCR strategy calls for it;</li>
     *   <li>handles the page's close and open additional actions;</li>
     *   <li>emits the closing {@code div} element;</li>
     *   <li>when {@code config.isExtractFontNames()} is set, collects the names of the fonts
     *       listed in the page's resources.</li>
     * </ol>
     * The per-page character counters are reset to zero whether or not steps 2-5 succeed.
     *
     * @param page the page that has just been processed
     * @throws IOException if a {@link SAXException} or {@link TikaException} is raised while
     *                     ending the page (wrapped as the cause), if {@code handleCatchableIOE}
     *                     propagates an I/O failure, or if loading a font from the page's
     *                     resources fails
     */
    protected void endPage(PDPage page) throws IOException {
        metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
        metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE, unmappedUnicodeCharsPerPage);

        try {
            for (PDAnnotation annotation : page.getAnnotations()) {
                // Record the annotation name/subtype, substituting NULL_STRING for missing
                // values; stop adding once the collections reach MAX_ANNOTATION_TYPES.
                String annotationName = annotation.getAnnotationName();
                if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
                    if (annotationName != null) {
                        annotationTypes.add(annotationName);
                    } else {
                        annotationTypes.add(NULL_STRING);
                    }
                }
                String annotationSubtype = annotation.getSubtype();
                if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
                    if (annotationSubtype != null) {
                        annotationSubtypes.add(annotationSubtype);
                    } else {
                        annotationSubtypes.add(NULL_STRING);
                    }
                }

                if (annotation instanceof PDAnnotationFileAttachment) {
                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                    String subtype = "annotationFileAttachment";
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "source", "source", "CDATA", subtype);
                    processDocOnAction("", subtype, fann.getFile(), attributes);
                } else if (annotation instanceof PDAnnotationWidget) {
                    handleWidget((PDAnnotationWidget) annotation);
                } else {
                    if (annotationSubtype == null) {
                        // Used below as the "source" attribute value for any file specs.
                        annotationSubtype = "unknown";
                    } else if (annotationSubtype.equals(THREE_D) ||
                            annotation.getCOSObject().containsKey(THREE_DD)) {
                        //To make this stricter, we could get the 3DD stream object and see if the
                        //subtype is U3D or PRC or model/ (prefix for model mime type)
                        metadata.set(PDF.HAS_3D, true);
                        num3DAnnotations++;
                    }
                    for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
                        AttributesImpl attributes = new AttributesImpl();
                        attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype);
                        processDocOnAction("", annotationSubtype, createFileSpecification(fileSpec),
                                attributes);
                    }
                }

                // TODO: remove once PDFBOX-1143 is fixed:
                if (config.isExtractAnnotationText()) {
                    // Emit the annotation's URI action (if any) as a link.
                    PDActionURI uri = getActionURI(annotation);
                    if (uri != null) {
                        String link = uri.getURI();
                        if (link != null && !link.isBlank()) {
                            xhtml.startElement("div", "class", "annotation");
                            xhtml.startElement("a", "href", link);
                            xhtml.characters(link);
                            xhtml.endElement("a");
                            xhtml.endElement("div");
                        }
                    }

                    // Emit title/subject/contents of markup annotations; the wrapper div is
                    // only written when at least one of them is present.
                    if (annotation instanceof PDAnnotationMarkup) {
                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                        String title = annotationMarkup.getTitlePopup();
                        String subject = annotationMarkup.getSubject();
                        String contents = annotationMarkup.getContents();
                        // TODO: maybe also annotationMarkup.getRichContents()?
                        if (title != null || subject != null || contents != null) {
                            xhtml.startElement("div", "class", "annotation");

                            if (title != null) {
                                xhtml.startElement("div", "class", "annotationTitle");
                                xhtml.characters(title);
                                xhtml.endElement("div");
                            }

                            if (subject != null) {
                                xhtml.startElement("div", "class", "annotationSubject");
                                xhtml.characters(subject);
                                xhtml.endElement("div");
                            }

                            if (contents != null) {
                                xhtml.startElement("div", "class", "annotationContents");
                                xhtml.characters(contents);
                                xhtml.endElement("div");
                            }

                            xhtml.endElement("div");
                        }
                    }
                }
            }

            if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
                doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
            } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) {
                boolean unmappedExceedsLimit = false;
                if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
                    // There are enough characters to not have to do OCR.  Check number of unmapped characters
                    final float percentUnmapped =
                            (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
                    final float unmappedCharacterLimit =
                            config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
                    // A limit below 1 is compared against the unmapped fraction; otherwise it
                    // is compared against the absolute unmapped character count.
                    unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
                            percentUnmapped > unmappedCharacterLimit :
                            unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
                }
                // OCR when the page has too few characters or too many unmapped ones.
                if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
                        unmappedExceedsLimit) {
                    doOCROnCurrentPage(page, AUTO);
                }
            }

            PDPageAdditionalActions pageActions = page.getActions();
            if (pageActions != null) {
                handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
                handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
            }
            // NOTE(review): presumably closes the page-level div opened when the page was
            // started -- confirm against startPage.
            xhtml.endElement("div");
        } catch (SAXException | TikaException e) {
            throw new IOException("Unable to end a page", e);
        } catch (IOException e) {
            handleCatchableIOE(e);
        } finally {
            // Always reset the per-page counters so they do not leak into the next page.
            totalCharsPerPage = 0;
            unmappedUnicodeCharsPerPage = 0;
        }

        // PDPage.getResources() returns null when the page has no (own or inherited)
        // resources dictionary; skip font collection in that case instead of throwing an NPE.
        if (config.isExtractFontNames() && page.getResources() != null) {
            for (COSName n : page.getResources().getFontNames()) {
                PDFont font = page.getResources().getFont(n);
                if (font != null && font.getFontDescriptor() != null) {
                    String fontName = font.getFontDescriptor().getFontName();
                    if (fontName != null) {
                        fontNames.add(fontName);
                    }
                }
            }
        }
    }