private void extractMetadata()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java [560:680]


    /**
     * Populates {@code metadata} with document-level information read from {@code document}.
     * <p>
     * In order, this records: the content type, the current access permissions and the
     * encryption flag, the catalog language (when present), whether the AcroForm has any
     * fields, the catalog's metadata (delegated to {@code PDMetadataExtractor.extract}), the
     * page count, the entries of the document information dictionary (well-known keys first,
     * then every remaining key under {@code PDF.PDF_DOC_INFO_CUSTOM_PREFIX}), the PDF version,
     * and the entries of the catalog's {@code Extensions} dictionary.
     * <p>
     * The {@code Extensions} entry and its {@code ADBE} child come from the (untrusted) file,
     * so both are type-checked before use; values that are not dictionaries are skipped
     * instead of failing with a {@link ClassCastException}.
     *
     * @param document the PDF document to read from
     * @param metadata the metadata object to populate
     * @param context  the parse context, handed through to {@code PDMetadataExtractor.extract}
     * @throws TikaException declared by this method; NOTE(review): presumably raised by
     *                       {@code PDMetadataExtractor.extract} -- confirm against that class
     */
    private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
            throws TikaException {
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());

        //first extract AccessPermissions
        AccessPermission ap = document.getCurrentAccessPermission();
        metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
                Boolean.toString(ap.canExtractForAccessibility()));
        metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
        metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
                Boolean.toString(ap.canAssembleDocument()));
        metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
        metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
        metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
                Boolean.toString(ap.canModifyAnnotations()));
        metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
        metadata.set(AccessPermissions.CAN_PRINT_FAITHFUL,
                Boolean.toString(ap.canPrintFaithful()));
        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));

        // Read the language once and only record it when the catalog declares one.
        String language = document.getDocumentCatalog().getLanguage();
        if (language != null) {
            metadata.set(TikaCoreProperties.LANGUAGE, language);
        }
        // TIKA-3246: Do this for the first call of getAcroForm(),
        // subsequent calls should use the same fixup or null to avoid a default fixup.
        // Do not call without parameters (would mean default fixup which is slower because
        // it creates annotation appearances)
        PDDocumentFixup fixup = new TikaAcroFormFixup(document);
        PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(fixup);
        if (acroForm != null && acroForm.getFields() != null && !acroForm.getFields().isEmpty()) {
            metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
        }
        // This runs before the document-info handling below so that the
        // "only if not already set" checks on CREATOR/TITLE see its results.
        PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(), metadata, context);

        PDDocumentInformation info = document.getDocumentInformation();
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
        //if this wasn't already set by xmp, use doc info
        if (metadata.get(TikaCoreProperties.CREATOR) == null) {
            PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
        }
        if (metadata.get(TikaCoreProperties.TITLE) == null) {
            PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
        }
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
        PDMetadataExtractor
                .addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
        PDMetadataExtractor.addMetadata(metadata, PDF.PRODUCER, info.getProducer());

        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());

        // Both the keywords and the subject are offered to TikaCoreProperties.SUBJECT.
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());

        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
        Calendar created = info.getCreationDate();
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATED, created);
        Calendar modified = info.getModificationDate();
        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);

        // All remaining metadata is custom
        // Copy this over as-is
        List<String> handledMetadata =
                Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
                        "Producer", "Subject", "Title", "Trapped");
        COSDictionary infoDict = info.getCOSObject();
        for (COSName key : infoDict.keySet()) {
            String name = key.getName();
            if (!handledMetadata.contains(name)) {
                PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                        infoDict.getDictionaryObject(key));
            }
        }

        //try to get the various versions
        //Caveats:
        //    there is currently a fair amount of redundancy
        //    TikaCoreProperties.FORMAT can be multivalued
        //    There are also three potential pdf specific version keys:
        //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
        String pdfVersion = Float.toString(document.getDocument().getVersion());
        metadata.set(PDF.PDF_VERSION, pdfVersion);
        metadata.add(TikaCoreProperties.FORMAT.getName(),
                MEDIA_TYPE.toString() + "; version=" + pdfVersion);


        //TODO: Let's try to move this into PDFBox.
        //Attempt to determine Adobe extension level, if present:
        COSDictionary root = document.getDocumentCatalog().getCOSObject();
        // Held as Object and type-checked: a malformed file may store something other than
        // a dictionary under /Extensions, and a blind cast would abort metadata extraction.
        Object extensionsObj = root.getDictionaryObject(COSName.getPDFName("Extensions"));
        if (extensionsObj instanceof COSDictionary) {
            COSDictionary extensions = (COSDictionary) extensionsObj;
            COSName adobeKey = COSName.getPDFName("ADBE");
            for (COSName extName : extensions.keySet()) {
                // If it's an Adobe one, interpret it to determine the extension level:
                if (extName.equals(adobeKey)) {
                    // Same defensive check for the /ADBE entry itself.
                    Object adobeObj = extensions.getDictionaryObject(extName);
                    if (adobeObj instanceof COSDictionary) {
                        COSDictionary adobeExt = (COSDictionary) adobeObj;
                        String baseVersion =
                                adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                        int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                        //-1 is sentinel value that something went wrong in getInt
                        if (el != -1) {
                            String extensionVersion =
                                    baseVersion + " Adobe Extension Level " + el;
                            metadata.set(PDF.PDF_EXTENSION_VERSION, extensionVersion);
                            metadata.add(TikaCoreProperties.FORMAT.getName(),
                                    MEDIA_TYPE.toString() + "; version=\"" + extensionVersion +
                                            "\"");
                        }
                    }
                } else {
                    // WARN that there is an Extension, but it's not Adobe's, and so is a 'new'
                    // format'.
                    metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
                }
            }
        }
    }