tax-processing-pipeline-python/docai_utils.py (91 lines of code) (raw):

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Document AI Utility Functions"""

from typing import Dict, List, Optional, Sequence, Tuple

from consts import CLASSIFIER_PROCESSOR_TYPES
from consts import DEFAULT_MIME_TYPE
from consts import DOCAI_ACTIVE_PROCESSORS
from consts import DOCAI_PROCESSOR_LOCATION
from consts import DOCAI_PROJECT_ID
from consts import DOCUMENT_SUPPORTED_PROCESSOR_TYPES

from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai

# Document AI is served from regional endpoints, so the client is pointed at
# the endpoint for the configured processor location.
client_options = ClientOptions(
    api_endpoint=f"{DOCAI_PROCESSOR_LOCATION}-documentai.googleapis.com"
)

# Instantiates a client (shared by every helper in this module)
documentai_client = documentai.DocumentProcessorServiceClient(
    client_options=client_options
)


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: Optional[bytes] = None,
    inline_document: Optional[documentai.Document] = None,
    mime_type: str = DEFAULT_MIME_TYPE,
) -> Optional[documentai.Document]:
    """
    Processes a document using the Document AI API.
    Takes in bytes from file reading, instead of a file path.

    Args:
        project_id: Project that owns the processor.
        location: Location of the processor.
        processor_id: ID of the processor to call.
        file_content: Raw file bytes. Takes precedence over
            ``inline_document`` when non-empty.
        inline_document: Document object to send when no raw bytes are given.
        mime_type: MIME type sent along with ``file_content``.

    Returns:
        The ``document`` field of the process response, or ``None`` when
        neither ``file_content`` nor ``inline_document`` is provided (no
        request is sent in that case).
    """
    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Configure the process request
    request = documentai.ProcessRequest(name=resource_name)

    if file_content:
        # Load Binary Data into Document AI RawDocument Object
        request.raw_document = documentai.RawDocument(
            content=file_content, mime_type=mime_type
        )
    elif inline_document:
        request.inline_document = inline_document
    else:
        # Nothing to process; callers must handle the None result.
        return None

    # Use the Document AI client to process the document
    result = documentai_client.process_document(request=request)

    return result.document


def extract_document_entities(document: documentai.Document) -> Dict[str, str]:
    """
    Get all entities from a document and output as a dictionary.

    Format: entity.type_: entity.normalized_value.text OR entity.mention_text

    When several entities share the same type, the last one wins.
    """
    # For a full list of fields for each processor see
    # the processor documentation:
    # https://cloud.google.com/document-ai/docs/processors-list

    # Use EKG Enriched Data if available. `normalized_value` is a declared
    # message field, so a hasattr() check cannot tell whether it was
    # populated; fall back to the raw mention text when the normalized text
    # is empty instead.
    return {
        entity.type_: entity.normalized_value.text or entity.mention_text
        for entity in document.entities
    }


def select_processor_from_classification(
    document_classification: str = "other",
) -> Tuple[str, Optional[str]]:
    """
    Select Processor for a given Document Classification.

    Returns:
        A ``(processor_type, processor_id)`` tuple. The type falls back to
        ``"FORM_PARSER_PROCESSOR"`` for unknown classifications; the ID is
        ``None`` when no processor is registered for that type.
    """
    # Get Supported Parser Processor Type from Document Classification
    processor_type = DOCUMENT_SUPPORTED_PROCESSOR_TYPES.get(
        document_classification, "FORM_PARSER_PROCESSOR"
    )

    # Get Specific Processor ID for this Parser Type
    processor_id = DOCAI_ACTIVE_PROCESSORS.get(processor_type)

    return processor_type, processor_id


def classify_document(file_content: bytes, mime_type: str) -> Optional[str]:
    """
    Classify a single document with all available specialized processors.

    Classifiers are tried in the order given by ``CLASSIFIER_PROCESSOR_TYPES``.

    Returns:
        The type of the first entity returned by the first classifier that
        yields something other than ``"other"``, or ``None`` when no
        classifier recognizes the document.
    """
    # Cycle through all possible classifier Processor Types
    for classifier_processor_type in CLASSIFIER_PROCESSOR_TYPES:
        # Get Specific Processor ID for this Classifier Type
        classifier_processor_id = DOCAI_ACTIVE_PROCESSORS.get(classifier_processor_type)
        if not classifier_processor_id:
            continue

        # Classify Document
        classification_document_proto = process_document(
            DOCAI_PROJECT_ID,
            DOCAI_PROCESSOR_LOCATION,
            classifier_processor_id,
            file_content=file_content,
            mime_type=mime_type,
        )

        # process_document returns None for empty content, and a classifier
        # may return no entities at all; in both cases there is nothing to
        # read, so move on to the next classifier instead of crashing.
        if (
            classification_document_proto is None
            or not classification_document_proto.entities
        ):
            continue

        # Translate Classification Output to Processor Type
        document_classification = classification_document_proto.entities[0].type_

        # Specialized Classifiers return "other"
        # if it could not classify to a known type
        if document_classification == "other":
            continue

        return document_classification

    # No classifier produced a known type.
    return None


def get_processor_id(path: str) -> str:
    """
    Extract Processor ID (Hexadecimal Number) from full processor path
    """
    return documentai_client.parse_processor_path(path)["processor"]


def fetch_processor_types(
    project_id: str, location: str
) -> Sequence[documentai.ProcessorType]:
    """
    Returns a list of processor types enabled for the given project.
    """
    response = documentai_client.fetch_processor_types(
        parent=documentai_client.common_location_path(project_id, location)
    )
    return response.processor_types


def create_processor(
    project_id: str, location: str, display_name: str, processor_type: str
) -> documentai.Processor:
    """
    Creates a new processor of the given type under the project/location.
    """
    processor_info = documentai.Processor(
        display_name=display_name, type_=processor_type
    )
    return documentai_client.create_processor(
        parent=documentai_client.common_location_path(project_id, location),
        processor=processor_info,
    )


def list_processors(project_id: str, location: str) -> List[documentai.Processor]:
    """Lists existing processors."""
    # The client returns a pager; list() drains every page.
    return list(
        documentai_client.list_processors(
            parent=documentai_client.common_location_path(project_id, location),
        )
    )