in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java [696:836]
protected void endPage(PDPage page) throws IOException {
    // Finishes processing of one page. In order, this method:
    //   1. records the per-page character counters in the metadata;
    //   2. walks the page's annotations: tracks their names/subtypes, hands
    //      file attachments and file specs to processDocOnAction, delegates
    //      widgets to handleWidget, flags 3D annotations and, if configured,
    //      writes link/markup annotation text to the XHTML output;
    //   3. triggers OCR when the configured OCR strategy asks for it;
    //   4. handles the page's close/open additional actions;
    //   5. ends the current "div" element
    //      (presumably the one opened for this page -- confirm against startPage);
    //   6. collects font names from the page resources, if configured.
    //
    // page: the page that has just been processed.
    // Throws IOException if a SAXException or TikaException is raised while
    // ending the page (wrapped), if handleCatchableIOE rethrows, or if a
    // font cannot be loaded during font-name extraction.
    metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
    metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE, unmappedUnicodeCharsPerPage);
    try {
        for (PDAnnotation annotation : page.getAnnotations()) {
            // Record annotation names and subtypes, but stop adding once
            // MAX_ANNOTATION_TYPES entries have been collected.
            String annotationName = annotation.getAnnotationName();
            if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
                if (annotationName != null) {
                    annotationTypes.add(annotationName);
                } else {
                    annotationTypes.add(NULL_STRING);
                }
            }
            String annotationSubtype = annotation.getSubtype();
            if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
                if (annotationSubtype != null) {
                    annotationSubtypes.add(annotationSubtype);
                } else {
                    annotationSubtypes.add(NULL_STRING);
                }
            }

            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                String subtype = "annotationFileAttachment";
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "source", "source", "CDATA", subtype);
                processDocOnAction("", subtype, fann.getFile(), attributes);
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            } else {
                // NOTE(review): because this is an else-if, an annotation with a
                // null subtype is never checked for the 3DD key -- confirm that
                // this is intended.
                if (annotationSubtype == null) {
                    annotationSubtype = "unknown";
                } else if (annotationSubtype.equals(THREE_D) ||
                        annotation.getCOSObject().containsKey(THREE_DD)) {
                    //To make this stricter, we could get the 3DD stream object and see if the
                    //subtype is U3D or PRC or model/ (prefix for model mime type)
                    metadata.set(PDF.HAS_3D, true);
                    num3DAnnotations++;
                }
                // Any file specs found in the annotation dictionary are processed
                // with the annotation subtype as their "source" attribute.
                for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype);
                    processDocOnAction("", annotationSubtype, createFileSpecification(fileSpec),
                            attributes);
                }
            }

            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.isExtractAnnotationText()) {
                // Emit the annotation's URI action (if any) as a link.
                PDActionURI uri = getActionURI(annotation);
                if (uri != null) {
                    String link = uri.getURI();
                    if (link != null && !link.isBlank()) {
                        xhtml.startElement("div", "class", "annotation");
                        xhtml.startElement("a", "href", link);
                        xhtml.characters(link);
                        xhtml.endElement("a");
                        xhtml.endElement("div");
                    }
                }

                // Emit title, subject and contents of markup annotations, each
                // in its own div, skipping the parts that are null.
                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");
                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }
                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }
                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }
                        xhtml.endElement("div");
                    }
                }
            }
        }

        if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
            doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
        } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) {
            // AUTO: OCR the page when it has too few characters, or when it has
            // enough characters but too many of them have no unicode mapping.
            boolean unmappedExceedsLimit = false;
            if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
                // There are enough characters to not have to do OCR. Check number of unmapped characters
                final float percentUnmapped =
                        (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
                final float unmappedCharacterLimit =
                        config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
                // A limit below 1 is compared against the unmapped fraction;
                // otherwise it is compared against the absolute unmapped count.
                unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
                        percentUnmapped > unmappedCharacterLimit :
                        unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
            }
            if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
                    unmappedExceedsLimit) {
                doOCROnCurrentPage(page, AUTO);
            }
        }

        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOException("Unable to end a page", e);
    } catch (IOException e) {
        handleCatchableIOE(e);
    } finally {
        // Always reset the per-page counters, even if ending the page failed.
        totalCharsPerPage = 0;
        unmappedUnicodeCharsPerPage = 0;
    }

    // PDPage.getResources() returns null when the page has no resource
    // dictionary; such a page has no fonts to report, so skip it instead of
    // failing with a NullPointerException.
    if (config.isExtractFontNames() && page.getResources() != null) {
        for (COSName n : page.getResources().getFontNames()) {
            PDFont font = page.getResources().getFont(n);
            if (font != null && font.getFontDescriptor() != null) {
                String fontName = font.getFontDescriptor().getFontName();
                if (fontName != null) {
                    fontNames.add(fontName);
                }
            }
        }
    }
}