in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java [560:680]
/**
 * Copies document-level information from {@code document} into {@code metadata}.
 * <p>
 * In order, this records: the content type, the current access permissions, the
 * encryption flag, the catalog language (if any), whether the AcroForm has fields,
 * the catalog metadata stream (via {@link PDMetadataExtractor}), the page count, the
 * standard document-information entries, any non-standard document-information
 * entries (under {@code PDF.PDF_DOC_INFO_CUSTOM_PREFIX}), the PDF version, and the
 * Adobe extension level if the catalog declares one.
 * <p>
 * The order of the calls matters: values already present (e.g. creator/title written
 * while processing the catalog metadata) take precedence over the document-information
 * fallbacks, and multi-valued properties are appended in call order.
 *
 * @param document the loaded PDF to read from
 * @param metadata the metadata object to populate
 * @param context  the parse context, passed through to {@link PDMetadataExtractor}
 * @throws TikaException declared by this method; no throw site is visible in the body,
 *                       so it presumably propagates from the helpers called here
 */
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
        throws TikaException {
    metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
            Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
            Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_FAITHFUL,
            Boolean.toString(ap.canPrintFaithful()));
    metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));

    // Read the language once instead of querying the catalog twice.
    String language = document.getDocumentCatalog().getLanguage();
    if (language != null) {
        metadata.set(TikaCoreProperties.LANGUAGE, language);
    }

    // TIKA-3246: Do this for the first call of getAcroForm(),
    // subsequent calls should use the same fixup or null to avoid a default fixup.
    // Do not call without parameters (would mean default fixup which is slower because
    // it creates annotation appearances)
    PDDocumentFixup fixup = new TikaAcroFormFixup(document);
    PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(fixup);
    if (acroForm != null) {
        // Fetch the field list a single time; only its presence/emptiness is needed.
        List<?> fields = acroForm.getFields();
        if (fields != null && !fields.isEmpty()) {
            metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
        }
    }

    // Catalog metadata is processed before the document-information dictionary so that
    // the "if not already set" fallbacks below only apply when it supplied nothing.
    PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(), metadata, context);

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    //if this wasn't already set by xmp, use doc info
    if (metadata.get(TikaCoreProperties.CREATOR) == null) {
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
    }
    if (metadata.get(TikaCoreProperties.TITLE) == null) {
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
    }
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    PDMetadataExtractor
            .addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    PDMetadataExtractor.addMetadata(metadata, PDF.PRODUCER, info.getProducer());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
    // Both keywords and subject are passed for the core SUBJECT property, keywords first.
    PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
    PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    Calendar created = info.getCreationDate();
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
    PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATED, created);
    Calendar modified = info.getModificationDate();
    PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata =
            Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
                    "Producer", "Subject", "Title", "Trapped");
    COSDictionary infoDictionary = info.getCOSObject();
    for (COSName key : infoDictionary.keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    infoDictionary.getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    String pdfVersion = Float.toString(document.getDocument().getVersion());
    metadata.set(PDF.PDF_VERSION, pdfVersion);
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + pdfVersion);

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    // The file content is untrusted: /Extensions may be absent or may hold something
    // other than a dictionary. getCOSDictionary yields null in both cases, where a
    // hard cast would abort metadata extraction with a ClassCastException.
    COSDictionary extensions = root.getCOSDictionary(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        COSName adobePrefix = COSName.getPDFName("ADBE");
        for (COSName extName : extensions.keySet()) {
            if (!extName.equals(adobePrefix)) {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new'
                // format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
                continue;
            }
            // It's the Adobe one: interpret it to determine the extension level.
            // As above, tolerate an /ADBE entry that is not a dictionary.
            COSDictionary adobeExt = extensions.getCOSDictionary(extName);
            if (adobeExt == null) {
                continue;
            }
            // NOTE(review): if /BaseVersion is missing this is presumably null and the
            // literal text "null" ends up in the values below -- behavior kept as-is,
            // confirm whether that is intended.
            String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
            int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
            //-1 is sentinel value that something went wrong in getInt
            if (el != -1) {
                String extensionVersion = baseVersion + " Adobe Extension Level " + el;
                metadata.set(PDF.PDF_EXTENSION_VERSION, extensionVersion);
                metadata.add(TikaCoreProperties.FORMAT.getName(),
                        MEDIA_TYPE.toString() + "; version=\"" + extensionVersion + "\"");
            }
        }
    }
}