in community/document-parsers/spring-ai-alibaba-starter-document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/PagePdfDocumentParser.java [67:141]
/**
 * Parses a PDF read from the given stream and converts the text of its pages into
 * {@link Document}s.
 * <p>
 * For every page, the text inside the page's media box (reduced by the configured
 * top and bottom margins) is extracted, passed through the configured
 * {@code pageExtractedTextFormatter}, and buffered. Buffered page texts are
 * concatenated and emitted as one document whenever the per-document page counter
 * reaches {@code config.pagesPerDocument}; whatever is still buffered after the last
 * page is emitted as a final document. When {@code config.pagesPerDocument} equals
 * {@code PdfDocumentReaderConfig.ALL_PAGES}, only that final document is produced.
 * Groups whose text is blank are skipped.
 * <p>
 * The parsed {@code PDDocument} is closed before this method returns. This method
 * does not itself call {@code close()} on {@code inputStream}.
 * @param inputStream stream supplying the raw PDF bytes
 * @return the documents built from the PDF, in page order; empty when the PDF has no
 * pages or no extractable text
 * @throws RuntimeException wrapping the {@link IOException} raised while parsing the
 * PDF or extracting text
 */
public List<Document> parse(InputStream inputStream) {
	List<Document> readDocuments = new ArrayList<>();
	// try-with-resources releases the parsed document on every exit path; the
	// documents returned below are built from extracted strings only.
	try (PDDocument document = new PDFParser(new org.apache.pdfbox.io.RandomAccessReadBuffer(inputStream))
		.parse()) {
		var pdfTextStripper = new PDFLayoutTextStripperByArea();
		var pages = document.getDocumentCatalog().getPages();

		int pageNumber = 0;
		int pagesPerDocument = 0;
		int startPageNumber = pageNumber;
		List<String> pageTextGroupList = new ArrayList<>();

		int totalPages = pages.getCount();
		// Log roughly ten progress lines per file; with ten pages or fewer, log on
		// every iteration.
		int logFrequency = totalPages > 10 ? totalPages / 10 : 1;
		int counter = 0;

		for (PDPage page : pages) {
			if (counter % logFrequency == 0 && counter / logFrequency < 10) {
				logger.info("Processing PDF page: {}", (counter + 1));
			}
			counter++;

			pagesPerDocument++;
			// NOTE(review): this flush runs before the current page's text is
			// extracted, so the page that triggers it ends up in the next group.
			// Confirm whether that grouping is intended before changing it.
			if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
					&& pagesPerDocument >= this.config.pagesPerDocument) {
				pagesPerDocument = 0;
				var aggregatedPageTextGroup = String.join("", pageTextGroupList);
				if (StringUtils.hasText(aggregatedPageTextGroup)) {
					readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber));
				}
				pageTextGroupList.clear();
				startPageNumber = pageNumber + 1;
			}

			// Extraction region: the media box with the configured margins removed.
			var mediaBox = page.getMediaBox();
			int x0 = (int) mediaBox.getLowerLeftX();
			int xW = (int) mediaBox.getWidth();
			int y0 = (int) mediaBox.getLowerLeftY() + this.config.pageTopMargin;
			int yW = (int) mediaBox.getHeight() - (this.config.pageTopMargin + this.config.pageBottomMargin);

			pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
			pdfTextStripper.extractRegions(page);
			var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);
			if (StringUtils.hasText(pageText)) {
				pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);
				pageTextGroupList.add(pageText);
			}
			pageNumber++;
			// The region is re-registered per page because page sizes may differ.
			pdfTextStripper.removeRegion(PDF_PAGE_REGION);
		}

		// Emit whatever is still buffered after the last page.
		if (!CollectionUtils.isEmpty(pageTextGroupList)) {
			readDocuments.add(toDocument(String.join("", pageTextGroupList), startPageNumber, pageNumber));
		}
		logger.info("Processing {} pages", totalPages);
		return readDocuments;
	}
	catch (IOException e) {
		throw new RuntimeException(e);
	}
}