in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java [143:248]
/**
 * Parses a PDF read from {@code stream}, recording document information in
 * {@code metadata} and, when {@code handler} is non-null, emitting content to it.
 *
 * <p>The effective configuration is this parser's default config, updated with any
 * {@link PDFParserConfig} found in {@code context}. When that configuration has KCMS
 * set, the JVM-wide system property {@code sun.java2d.cmm} is set as a side effect.
 *
 * <p>For the duration of the call the {@code IncrementalUpdateRecord},
 * {@code OCRPageCounter} and (when spooling) {@code PDFRenderingState} entries of
 * {@code context} are replaced; the incoming values are put back in the
 * {@code finally} block. The OCR page count accumulated during the call is written
 * to {@code metadata} under {@code OCR_PAGE_COUNT}.
 *
 * <p>Content is extracted through exactly one of four paths, checked in this order:
 * XFA only, OCR only, marked content, or the default {@code PDF2XHTML} path.
 *
 * <p>If PDFBox reports an {@code InvalidPasswordException}, {@code PDF.IS_ENCRYPTED}
 * is set to {@code "true"} in {@code metadata} and an
 * {@code EncryptedDocumentException} wrapping the cause is thrown.
 *
 * @param stream   the PDF input; closed by this method only indirectly, i.e. only a
 *                 wrapper created here is closed, never the caller's stream
 * @param handler  receives the extracted content; if {@code null}, no content
 *                 extraction is performed
 * @param metadata updated with the information gathered during parsing
 * @param context  source of per-parse configuration and state
 * @throws IOException   if reading the document or releasing resources fails
 * @throws SAXException  declared for the content-handler based extraction steps
 * @throws TikaException declared for the Tika-level parsing steps
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    // Resolve the effective configuration: defaults, overridden by a per-call config.
    PDFParserConfig localConfig = defaultConfig;
    PDFParserConfig userConfig = context.get(PDFParserConfig.class);
    if (userConfig != null) {
        localConfig = defaultConfig.cloneAndUpdate(userConfig);
    }
    if (localConfig.isSetKCMS()) {
        // NOTE: this is a JVM-wide setting, not scoped to this parse.
        System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
    }

    // Remember the caller's context entries so they can be restored afterwards.
    IncrementalUpdateRecord incomingIncrementalUpdateRecord =
            context.get(IncrementalUpdateRecord.class);
    context.set(IncrementalUpdateRecord.class, null);
    initRenderer(localConfig, context);
    PDDocument pdfDocument = null;
    PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
    TikaInputStream tstream = null;
    // True only when this method created the TikaInputStream and therefore owns it.
    boolean shouldClose = false;
    OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class);
    context.set(OCRPageCounter.class, new OCRPageCounter());
    try {
        if (shouldSpool(localConfig)) {
            if (stream instanceof TikaInputStream) {
                tstream = (TikaInputStream) stream;
            } else {
                // Shield the caller's stream so closing our wrapper does not close it.
                tstream = TikaInputStream.get(CloseShieldInputStream.wrap(stream));
                shouldClose = true;
            }
            context.set(PDFRenderingState.class, new PDFRenderingState(tstream));
        } else {
            tstream = TikaInputStream.cast(stream);
        }
        scanXRefOffsets(localConfig, tstream, metadata, context);
        String password = getPassword(metadata, context);

        // A non-negative limit selects the mixed setup; otherwise main memory only.
        long maxMainMemoryBytes = localConfig.getMaxMainMemoryBytes();
        MemoryUsageSetting memoryUsageSetting = maxMainMemoryBytes >= 0
                ? MemoryUsageSetting.setupMixed(maxMainMemoryBytes)
                : MemoryUsageSetting.setupMainMemoryOnly();
        pdfDocument = getPDDocument(stream, tstream, password,
                memoryUsageSetting.streamCache, metadata, context);

        boolean hasCollection = hasCollection(pdfDocument, metadata);
        checkEncryptedPayload(pdfDocument, hasCollection, localConfig);
        boolean hasXFA = hasXFA(pdfDocument, metadata);
        boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
        extractMetadata(pdfDocument, metadata, context);
        extractSignatures(pdfDocument, metadata);
        checkIllustrator(pdfDocument, metadata);
        // The access check runs after metadata extraction and before any content
        // is rendered or extracted.
        AccessChecker checker = localConfig.getAccessChecker();
        checker.check(metadata);
        renderPagesBeforeParse(tstream, handler, metadata, context, localConfig);
        if (handler != null) {
            if (shouldHandleXFAOnly(hasXFA, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata, context);
            } else if (localConfig.getOcrStrategy()
                    .equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
                OCR2XHTML.process(pdfDocument, handler, context, metadata,
                        localConfig);
            } else if (hasMarkedContent && localConfig.isExtractMarkedContent()) {
                PDFMarkedContent2XHTML
                        .process(pdfDocument, handler, context, metadata,
                                localConfig);
            } else {
                PDF2XHTML.process(pdfDocument, handler, context, metadata,
                        localConfig);
            }
        }
    } catch (InvalidPasswordException e) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } finally {
        metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount());
        context.set(OCRPageCounter.class, prevOCRCounter);
        //reset the incrementalUpdateRecord even if null
        context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord);
        PDFRenderingState currState = context.get(PDFRenderingState.class);
        // Each release step gets its own finally so that a failure while closing
        // the render results can no longer skip closing the PDDocument.
        try {
            if (currState != null && currState.getRenderResults() != null) {
                currState.getRenderResults().close();
            }
        } finally {
            try {
                if (pdfDocument != null) {
                    pdfDocument.close();
                }
            } finally {
                //replace the one that was here
                context.set(PDFRenderingState.class, incomingRenderingState);
                if (shouldClose && tstream != null) {
                    tstream.close();
                }
            }
        }
    }
}