public Integer call()

in tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java [151:305]


    /**
     * Runs the text extraction for the configured input file.
     *
     * <p>The main document is extracted first, followed by every embedded file whose subtype is
     * {@code application/pdf}. Output goes to the writer returned by {@code createOutputWriter()}.
     *
     * @return {@code 0} on success, {@code 1} if both html and md output were requested or the
     *         document does not permit content extraction, {@code 4} if an {@link IOException}
     *         occurred while loading, extracting or writing.
     */
    public Integer call()
    {
        if (toHTML && toMD)
        {
            SYSERR.println( "You can't set md and html at the same time");
            return 1;
        }

        // set file extension
        String ext = toHTML ? ".html" : ".txt";
        ext = toMD ? ".md" : ext;

        if (outfile == null)
        {
            // default: same path as the input file, with the extension swapped
            String outPath = FilenameUtils.removeExtension(infile.getAbsolutePath()) + ext;
            outfile = new File(outPath);
        }

        if (toHTML && !STD_ENCODING.equals(encoding))
        {
            encoding = STD_ENCODING;
            SYSOUT.println("The encoding parameter is ignored when writing html output.");
        }

        if (toConsole && encoding != null)
        {
            SYSOUT.println("The encoding parameter is ignored when writing to the console.");
        }

        // Start the timer before the document is opened, so that the "Time for loading" figure
        // covers Loader.loadPDF() itself and not just the permission check that follows it.
        long startTime = startProcessing("Loading PDF " + infile);

        try (PDDocument document = Loader.loadPDF(infile, password))
        {
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent())
            {
                SYSERR.println( "You do not have permission to extract text");
                return 1;
            }

            stopProcessing("Time for loading: ", startTime);

            // The writer is opened only after the permission check has passed, so a rejected
            // document does not go through createOutputWriter() at all.
            // NOTE(review): createOutputWriter() is defined elsewhere; presumably it creates or
            // truncates outfile, which is why it must not run for a rejected document - confirm.
            try (Writer output = createOutputWriter())
            {
                startTime = startProcessing("Starting text extraction");

                if (addFileName)
                {
                    output.write("PDF file: " + infile);
                    output.write(System.lineSeparator());
                }

                if (debug)
                {
                    SYSERR.println("Writing to " + outfile.getAbsolutePath());
                }

                PDFTextStripper stripper = createStripper();

                // Extract text for main document:
                if (toHTML)
                {
                    // HTML stripper can't work page by page because of startDocument() callback
                    stripper.setStartPage(startPage);
                    stripper.setEndPage(endPage);
                    stripper.writeText(document, output);
                }
                else
                {
                    extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
                                 stripper, document, output, rotationMagic, alwaysNext);
                }

                // ... also for any embedded PDFs:
                extractEmbeddedPdfs(document, stripper, output);

                output.flush();
                stopProcessing("Time for extraction: ", startTime);
            }
        }
        catch (IOException ioe)
        {
            SYSERR.println( "Error extracting text for document [" + ioe.getClass().getSimpleName() + "]: " + ioe.getMessage());
            return 4;
        }

        return 0;
    }

    /**
     * Creates the text stripper matching the requested output format (html, md or plain text)
     * and applies the sort-by-position and bead-separation settings to it.
     *
     * @return the configured stripper; page ranges are not set here.
     * @throws IOException if the stripper cannot be created.
     */
    private PDFTextStripper createStripper() throws IOException
    {
        PDFTextStripper stripper;
        if (toHTML)
        {
            // the rotationMagic option has no effect on html output
            stripper = new PDFText2HTML();
        }
        else if (toMD)
        {
            stripper = rotationMagic ? new FilteredText2Markdown() : new PDFText2Markdown();
        }
        else
        {
            stripper = rotationMagic ? new FilteredTextStripper() : new PDFTextStripper();
        }
        stripper.setSortByPosition(sort);
        stripper.setShouldSeparateByBeads(!ignoreBeads);
        return stripper;
    }

    /**
     * Extracts the text of every embedded file of the document whose subtype is
     * {@code application/pdf} and appends it to the output. Does nothing if the document has no
     * names dictionary, no embedded files tree, or no names in that tree node.
     *
     * @param document the main document whose embedded files are inspected.
     * @param stripper the stripper already used for the main document; it is reused as is.
     * @param output the writer receiving the extracted text.
     * @throws IOException if an embedded PDF cannot be loaded or its text cannot be written.
     */
    private void extractEmbeddedPdfs(PDDocument document, PDFTextStripper stripper, Writer output)
            throws IOException
    {
        PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
        if (names == null)
        {
            return;
        }
        PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
        if (embeddedFiles == null)
        {
            return;
        }
        Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
        if (embeddedFileNames == null)
        {
            return;
        }

        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
        {
            if (debug)
            {
                SYSERR.println("Processing embedded file " + ent.getKey() + ":");
            }
            PDEmbeddedFile file = ent.getValue().getEmbeddedFile();
            if (file == null || !"application/pdf".equals(file.getSubtype()))
            {
                // only embedded PDFs are of interest here
                continue;
            }
            if (debug)
            {
                SYSERR.println("  is PDF (size=" + file.getSize() + ")");
            }
            try (PDDocument subDoc = Loader.loadPDF(RandomAccessReadBuffer
                            .createBufferFromStream(file.createInputStream())))
            {
                if (toHTML)
                {
                    // will not really work because of HTML header + footer
                    stripper.writeText( subDoc, output );
                }
                else
                {
                    extractPages(1, subDoc.getNumberOfPages(),
                                 stripper, subDoc, output, rotationMagic, alwaysNext);
                }
            }
        }
    }