pdf-splitter-python/main.py (139 lines of code) (raw):

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module defines a CLI that uses Document AI to split a PDF document"""

import argparse
import os
import sys
from typing import Sequence

from google.api_core.client_options import ClientOptions
import google.auth
from google.cloud.documentai import Document
from google.cloud.documentai import DocumentProcessorServiceClient
from google.cloud.documentai import Processor
from google.cloud.documentai import ProcessRequest
from google.cloud.documentai import RawDocument
from pikepdf import Pdf

DEFAULT_MULTI_REGION_LOCATION = "us"
DEFAULT_PROCESSOR_TYPE = "LENDING_DOCUMENT_SPLIT_PROCESSOR"
PDF_MIME_TYPE = "application/pdf"
PDF_EXTENSION = ".pdf"


def main(args: argparse.Namespace) -> int:
    """Split a PDF document using the Document AI API to identify split points.

    Args:
        args: Parsed command-line arguments. Reads ``input``, ``output_dir``,
            ``project_id``, ``multi_region_location`` and
            ``split_processor_type``. ``project_id`` and ``output_dir`` are
            filled in on the namespace when they were not supplied.

    Returns:
        Process exit code: 0 on success, 1 when the inputs are unusable.
    """
    if not args.project_id:
        # Fall back to the project attached to the ambient credentials.
        _, project_id = google.auth.default()
        args.project_id = project_id
    if not args.project_id:
        # google.auth.default() may report no project at all; without one the
        # processor resource path below cannot be built.
        print("Could not determine a project ID; pass --project-id")
        return 1

    file_path = os.path.abspath(args.input)

    if not os.path.isfile(file_path):
        print(f"Could not find file at {file_path}")
        return 1

    # Compare the real file extension (case-insensitively) instead of doing a
    # substring search, which accepted names such as "report.pdf.txt" and
    # rejected "REPORT.PDF".
    if os.path.splitext(file_path)[1].lower() != PDF_EXTENSION:
        print(f"Input file {args.input} is not a PDF")
        return 1

    if not args.output_dir:
        args.output_dir = os.path.dirname(file_path)

    # Make sure the destination exists before calling the API, so a missing
    # directory is reported up front rather than after the document was
    # processed.
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError as err:
        print(f"Could not create output directory {args.output_dir}: {err}")
        return 1

    client = DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{args.multi_region_location}-documentai.googleapis.com"
        )
    )

    processor_name = get_or_create_processor(
        client,
        args.project_id,
        args.multi_region_location,
        args.split_processor_type,
    )

    print(
        "Using:\n"
        f'* Project ID: "{args.project_id}"\n'
        f'* Location: "{args.multi_region_location}"\n'
        f'* Processor Name "{processor_name}"\n'
        f'* Input PDF "{os.path.basename(file_path)}"\n'
        f'* Output directory: "{args.output_dir}"\n'
    )

    document = online_process(client, processor_name, file_path)

    document_json = write_document_json(
        document, file_path, output_dir=args.output_dir
    )
    print(f"Document AI Output: {document_json}")

    split_pdf(document.entities, file_path, output_dir=args.output_dir)

    print("Done.")
    return 0


def get_or_create_processor(
    client: DocumentProcessorServiceClient,
    project_id: str,
    location: str,
    processor_type: str,
) -> str:
    """Return the name of a processor of the given type, creating it if needed.

    Lists the processors under ``project_id``/``location`` and returns the
    first one whose type equals ``processor_type``. When none matches, a new
    processor is created with ``processor_type`` as both its type and its
    display name.

    Args:
        client: Document AI client used for the list/create calls.
        project_id: Project that owns the processors.
        location: Location of the processors.
        processor_type: Processor type to look for (or create).

    Returns:
        The processor resource name.
    """
    parent = client.common_location_path(project_id, location)

    for processor in client.list_processors(parent=parent):
        if processor.type_ == processor_type:
            # Processor names have the form:
            # `projects/{project}/locations/{location}/processors/{processor_id}`
            # See https://cloud.google.com/document-ai/docs/create-processor
            # for more information.
            return processor.name

    print(
        "No split processor found. "
        f'creating new processor of type "{processor_type}"',
    )
    processor = client.create_processor(
        parent=parent,
        processor=Processor(display_name=processor_type, type_=processor_type),
    )
    return processor.name


def online_process(
    client: DocumentProcessorServiceClient,
    processor_name: str,
    file_path: str,
    mime_type: str = PDF_MIME_TYPE,
) -> Document:
    """Process the file at ``file_path`` with the given processor.

    Reads the whole file into memory and sends it as a raw document in a
    single ``process_document`` request.

    Args:
        client: Document AI client used for the request.
        processor_name: Resource name of the processor to call.
        file_path: Path of the file whose bytes are sent.
        mime_type: MIME type declared for the raw document.

    Returns:
        The ``document`` field of the process response.
    """
    with open(file_path, "rb") as pdf_file:
        content = pdf_file.read()

    result = client.process_document(
        request=ProcessRequest(
            name=processor_name,
            raw_document=RawDocument(content=content, mime_type=mime_type),
        )
    )
    return result.document


def write_document_json(document: Document, file_path: str, output_dir: str) -> str:
    """Write the Document object as a JSON file.

    The file is written to ``output_dir`` and named after the base name of
    ``file_path`` with its extension replaced by ``.json``.

    Args:
        document: Document to serialize.
        file_path: Path of the input file; only its base name is used.
        output_dir: Directory that receives the JSON file.

    Returns:
        Path of the JSON file that was written.
    """
    # File Path: output_dir/file_name.json
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_filepath = os.path.join(output_dir, f"{base_name}.json")

    with open(output_filepath, "w", encoding="utf-8") as json_file:
        json_file.write(
            Document.to_json(document, including_default_value_fields=False)
        )

    return output_filepath


def split_pdf(
    entities: Sequence[Document.Entity], file_path: str, output_dir: str
) -> None:
    """Create one sub-document PDF per entity of the Splitter/Classifier output.

    For each entity, the pages from its first page reference through its last
    page reference (inclusive) are copied into a new PDF saved in
    ``output_dir`` as ``pg<range>_<type>_<original file name>``. Entities
    without any page reference are skipped.

    Args:
        entities: Entities returned by the processor.
        file_path: Path of the original PDF.
        output_dir: Directory that receives the sub-documents.
    """
    original_name = os.path.basename(file_path)

    with Pdf.open(file_path) as original_pdf:
        # Create New PDF for each SubDocument
        print(f"Total subdocuments: {len(entities)}")

        for index, entity in enumerate(entities):
            page_refs = entity.page_anchor.page_refs
            if not page_refs:
                # Without page references there is no page range to extract;
                # indexing page_refs[0] would raise and abort the whole split.
                print(f"Skipping subdocument {index + 1}: no page references")
                continue

            # Page references are used directly as indexes into the PDF's
            # pages; the printed/file-name page numbers are shifted by one.
            start = int(page_refs[0].page)
            end = int(page_refs[-1].page)
            subdoc_type = entity.type_ or "subdoc"

            if start == end:
                page_range = f"pg{start + 1}"
            else:
                page_range = f"pg{start + 1}-{end + 1}"

            output_filename = f"{page_range}_{subdoc_type}"

            print(f"Creating subdocument {index + 1}: {output_filename}")

            subdoc = Pdf.new()
            for page_num in range(start, end + 1):
                subdoc.pages.append(original_pdf.pages[page_num])

            subdoc.save(
                os.path.join(output_dir, f"{output_filename}_{original_name}"),
                min_version=original_pdf.pdf_version,
            )


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(description="Split a PDF document.")
    parser.add_argument(
        "-i", "--input", help="filepath of input PDF to split", required=True
    )
    parser.add_argument(
        "--output-dir",
        help="directory to save subdocuments, default: input PDF directory",
    )
    parser.add_argument(
        "--project-id", help="Project ID to use to call the Document AI API"
    )
    parser.add_argument(
        "--multi-region-location",
        help="multi-regional location for document storage and processing",
        default=DEFAULT_MULTI_REGION_LOCATION,
    )
    parser.add_argument(
        "--split-processor-type",
        help='type of split processor e.g. "LENDING_DOCUMENT_SPLIT_PROCESSOR"',
        default=DEFAULT_PROCESSOR_TYPE,
    )
    return parser


if __name__ == "__main__":
    sys.exit(main(_build_parser().parse_args()))