in tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java [151:305]
/**
 * Runs the text extraction configured through the command line options.
 *
 * <p>The input PDF is loaded, checked for the "extract content" permission and then written
 * as plain text, Markdown or HTML to the output writer. PDF files embedded in the document
 * are extracted to the same writer afterwards.</p>
 *
 * @return {@code 0} on success, {@code 1} if both Markdown and HTML output were requested or
 * the document does not permit text extraction, {@code 4} if an {@link IOException} occurred.
 */
public Integer call()
{
    if (toHTML && toMD)
    {
        SYSERR.println( "You can't set md and html at the same time");
        return 1;
    }

    // set file extension
    String ext = toHTML ? ".html" : ".txt";
    ext = toMD ? ".md" : ext;
    if (outfile == null)
    {
        // no explicit output file: use the input path with its extension replaced
        String outPath = FilenameUtils.removeExtension(infile.getAbsolutePath()) + ext;
        outfile = new File(outPath);
    }

    if (toHTML && !STD_ENCODING.equals(encoding))
    {
        encoding = STD_ENCODING;
        SYSOUT.println("The encoding parameter is ignored when writing html output.");
    }
    if (toConsole && encoding != null)
    {
        SYSOUT.println("The encoding parameter is ignored when writing to the console.");
    }

    // Start the clock before the document is opened, otherwise "Time for loading" would
    // only cover the permission check below.
    long startTime = startProcessing("Loading PDF " + infile);
    try (PDDocument document = Loader.loadPDF(infile, password))
    {
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent())
        {
            SYSERR.println( "You do not have permission to extract text");
            return 1;
        }
        stopProcessing("Time for loading: ", startTime);

        // The writer is opened only after the permission check so that a refused extraction
        // does not acquire the output (presumably creating or truncating the output file;
        // createOutputWriter() is defined elsewhere in this class).
        try (Writer output = createOutputWriter())
        {
            startTime = startProcessing("Starting text extraction");
            if (addFileName)
            {
                output.write("PDF file: " + infile);
                output.write(System.lineSeparator());
            }
            if (debug)
            {
                SYSERR.println("Writing to " + outfile.getAbsolutePath());
            }

            PDFTextStripper stripper = newConfiguredStripper();

            // Extract text for main document:
            if (toHTML)
            {
                // HTML stripper can't work page by page because of startDocument() callback
                stripper.setStartPage(startPage);
                stripper.setEndPage(endPage);
                stripper.writeText(document, output);
            }
            else
            {
                extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
                        stripper, document, output, rotationMagic, alwaysNext);
            }

            // ... also for any embedded PDFs:
            extractEmbeddedPdfs(document, stripper, output);

            output.flush();
            stopProcessing("Time for extraction: ", startTime);
        }
    }
    catch (IOException ioe)
    {
        SYSERR.println( "Error extracting text for document [" + ioe.getClass().getSimpleName() + "]: " + ioe.getMessage());
        return 4;
    }
    return 0;
}

/**
 * Creates the text stripper matching the selected output format and rotation handling and
 * applies the sort and bead settings shared by all formats.
 *
 * @return the configured stripper.
 * @throws IOException if the stripper could not be created.
 */
private PDFTextStripper newConfiguredStripper() throws IOException
{
    PDFTextStripper stripper;
    if (toHTML)
    {
        stripper = new PDFText2HTML();
    }
    else if (toMD)
    {
        stripper = rotationMagic ? new FilteredText2Markdown() : new PDFText2Markdown();
    }
    else
    {
        stripper = rotationMagic ? new FilteredTextStripper() : new PDFTextStripper();
    }
    stripper.setSortByPosition(sort);
    stripper.setShouldSeparateByBeads(!ignoreBeads);
    return stripper;
}

/**
 * Extracts the text of every embedded file whose subtype is "application/pdf" and appends it
 * to the given writer. Only the entries returned by the embedded files name tree node of the
 * document's name dictionary are considered.
 *
 * @param document the main document whose embedded files are inspected.
 * @param stripper the stripper already used for the main document.
 * @param output the writer receiving the extracted text.
 * @throws IOException if an embedded PDF could not be read or the text could not be written.
 */
private void extractEmbeddedPdfs(PDDocument document, PDFTextStripper stripper, Writer output)
        throws IOException
{
    PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
    if (names == null)
    {
        return;
    }
    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
    if (embeddedFiles == null)
    {
        return;
    }
    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
    if (embeddedFileNames == null)
    {
        return;
    }

    for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
    {
        if (debug)
        {
            SYSERR.println("Processing embedded file " + ent.getKey() + ":");
        }
        PDEmbeddedFile file = ent.getValue().getEmbeddedFile();
        if (file == null || !"application/pdf".equals(file.getSubtype()))
        {
            // not an embedded PDF, nothing to extract
            continue;
        }
        if (debug)
        {
            SYSERR.println(" is PDF (size=" + file.getSize() + ")");
        }
        try (PDDocument subDoc = Loader.loadPDF(RandomAccessReadBuffer
                .createBufferFromStream(file.createInputStream())))
        {
            if (toHTML)
            {
                // will not really work because of HTML header + footer;
                // the stripper also still carries the start/end page of the main document
                stripper.writeText( subDoc, output );
            }
            else
            {
                extractPages(1, subDoc.getNumberOfPages(),
                        stripper, subDoc, output, rotationMagic, alwaysNext);
            }
        }
    }
}