core/pdf.py

# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Processor extracting parts from PDF documents (using pypdfium2).

Requires pypdfium2 to be installed:

```
pip install pypdfium2
```

See `pdf_cli.py` for an example of usage and to test this processor with a
local PDF file.
"""

import asyncio
from collections.abc import AsyncIterable
import io
import threading

from genai_processors import content_api
from genai_processors import processor
from PIL import Image
import pypdfium2 as pdfium

PDF_MIMETYPE = 'application/pdf'


def is_pdf(part: content_api.ProcessorPart) -> bool:
    """Returns True when the part's mimetype is `PDF_MIMETYPE`."""
    return part.mimetype == PDF_MIMETYPE


def _png_from_pil(img: Image.Image) -> bytes:
    """Converts PIL Image to PNG bytes.

    The image is converted to RGB before encoding, so any alpha channel is
    dropped.
    """
    img_byte_arr = io.BytesIO()
    img.convert('RGB').save(img_byte_arr, format='png')
    return img_byte_arr.getvalue()


# PDFium is not thread-safe: every pypdfium2 call made by this module happens
# while holding this lock.
_PDFIUM_RENDERER_LOCK = threading.Lock()


class PDFExtract(processor.PartProcessor):
    """Extracts PDFs into content parts.

    Takes the PDF file bytes and, page by page, emits the raw text of the page.
    Pages that contain image, form or path objects additionally get a
    full-page render (a PIL image wrapped in a `ProcessorPart`) emitted just
    before their text. The PDF is parsed and rendered with pypdfium2.

    Note that the input parts with the PDF file bytes should also have a
    metadata field with the `original_file_name` key. This is used to generate
    the start marker and the status message; when it is missing,
    `unknown.pdf` is used instead.
    """

    def _format(
        self, file_name: str, pdf_doc: pdfium.PdfDocument
    ) -> tuple[content_api.ProcessorContent, tuple[str, int, int]]:
        """Converts an open PDF document into content plus summary stats.

        Must be called while holding `_PDFIUM_RENDERER_LOCK`.

        Args:
          file_name: name used in the start marker and returned in the stats.
          pdf_doc: the open pypdfium2 document. It is not closed here.

        Returns:
          A tuple `(content, (file_name, page_count, pages_with_images))`
          where `pages_with_images` counts the pages for which a screenshot
          was emitted.
        """
        total_pages_with_images = 0
        content = content_api.ProcessorContent()
        content += f'--- START OF PDF {file_name} ---\n\n'
        for page_number, page in enumerate(pdf_doc, start=1):
            try:
                # We first find out if there is any visual (non-text) content
                # on this page.
                # FPDF_PAGEOBJ_IMAGE is for images,
                # FPDF_PAGEOBJ_FORM is for images embedded as PDFs,
                # FPDF_PAGEOBJ_PATH is for path objects (vector drawings).
                visual_objects = list(
                    page.get_objects(
                        filter=(
                            pdfium.raw.FPDF_PAGEOBJ_IMAGE,
                            pdfium.raw.FPDF_PAGEOBJ_FORM,
                            pdfium.raw.FPDF_PAGEOBJ_PATH,
                        ),
                        max_depth=0,
                    )
                )
                # If there is visual content on this page, we pass in a
                # screenshot / render of the page along with the raw text.
                if visual_objects:
                    total_pages_with_images += 1
                    screenshot_pil = page.render().to_pil()
                    content += (
                        f'---- Screenshot for PAGE {page_number} ----\n\n'
                    )
                    content += content_api.ProcessorPart(screenshot_pil)
                content += f'--- PAGE {page_number} ----\n\n'
                all_text_on_page = page.get_textpage().get_text_range()
                content += all_text_on_page
            finally:
                # Release the page even when rendering or text extraction
                # raises, instead of only on the success path.
                page.close()
        return content, (file_name, len(pdf_doc), total_pages_with_images)

    def _process(
        self,
        part: content_api.ProcessorPart,
    ) -> tuple[content_api.ProcessorContent, tuple[str, int, int]]:
        """Parses the PDF bytes of `part` and returns `_format`'s result.

        Blocking; `call` runs it in a worker thread. The whole parse happens
        under `_PDFIUM_RENDERER_LOCK`, and the document is always closed
        before the lock is released.

        Args:
          part: a part whose `bytes` hold the PDF file. Its
            `original_file_name` metadata, when set, names the document.

        Returns:
          The `(content, stats)` tuple produced by `_format`.
        """
        # pdfium is not thread safe.
        with _PDFIUM_RENDERER_LOCK:
            file_name = part.get_metadata('original_file_name') or 'unknown.pdf'
            pdf_doc = pdfium.PdfDocument(part.bytes)
            try:
                return self._format(file_name, pdf_doc)
            finally:
                pdf_doc.close()

    def match(self, part: content_api.ProcessorPart) -> bool:
        """Returns True only for parts with the PDF mimetype."""
        return is_pdf(part)

    async def call(
        self, part: content_api.ProcessorPart
    ) -> AsyncIterable[content_api.ProcessorPart]:
        """Extracts `part` and yields a status message, then the content.

        The extraction itself runs in a separate thread via
        `asyncio.to_thread` so the blocking pdfium work stays off the event
        loop.

        Args:
          part: the PDF part to extract.

        Yields:
          First a status part summarising the page counts, then every part of
          the extracted content in order.
        """
        content, stats = await asyncio.to_thread(self._process, part)
        filename, pages, pages_with_images = stats
        yield processor.status(
            'Parsed PDF'
            f' {filename} ({pages} pages,'
            f' {pages_with_images} pages with images)'
        )
        # Distinct loop name so the `part` argument is not shadowed.
        for extracted_part in content.all_parts:
            yield extracted_part

core/pdf.py (68 lines of code) (raw):