public List parse()

in community/document-parsers/spring-ai-alibaba-starter-document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/PagePdfDocumentParser.java [67:141]


	public List<Document> parse(InputStream inputStream) {

		List<Document> readDocuments = new ArrayList<>();
		try {
			var pdfTextStripper = new PDFLayoutTextStripperByArea();

			int pageNumber = 0;
			int pagesPerDocument = 0;
			int startPageNumber = pageNumber;

			List<String> pageTextGroupList = new ArrayList<>();
			PDFParser pdfParser = new PDFParser(new org.apache.pdfbox.io.RandomAccessReadBuffer(inputStream));
			PDDocument document = pdfParser.parse();

			int totalPages = document.getDocumentCatalog().getPages().getCount();
			// if less than 10
			int logFrequency = totalPages > 10 ? totalPages / 10 : 1;
			// pages, print
			// each iteration
			int counter = 0;

			PDPage lastPage = document.getDocumentCatalog().getPages().iterator().next();
			for (PDPage page : document.getDocumentCatalog().getPages()) {
				lastPage = page;
				if (counter % logFrequency == 0 && counter / logFrequency < 10) {
					logger.info("Processing PDF page: {}", (counter + 1));
				}
				counter++;

				pagesPerDocument++;

				if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
						&& pagesPerDocument >= this.config.pagesPerDocument) {
					pagesPerDocument = 0;

					var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
					if (StringUtils.hasText(aggregatedPageTextGroup)) {
						readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber));
					}
					pageTextGroupList.clear();

					startPageNumber = pageNumber + 1;
				}
				int x0 = (int) page.getMediaBox().getLowerLeftX();
				int xW = (int) page.getMediaBox().getWidth();

				int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
				int yW = (int) page.getMediaBox().getHeight()
						- (this.config.pageTopMargin + this.config.pageBottomMargin);

				pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
				pdfTextStripper.extractRegions(page);
				var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);

				if (StringUtils.hasText(pageText)) {

					pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);

					pageTextGroupList.add(pageText);
				}
				pageNumber++;
				pdfTextStripper.removeRegion(PDF_PAGE_REGION);
			}
			if (!CollectionUtils.isEmpty(pageTextGroupList)) {
				readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber,
						pageNumber));
			}
			logger.info("Processing {} pages", totalPages);
			return readDocuments;

		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}