community/codelabs/docai-specialized-processors/extraction.py (53 lines of code) (raw):

# mypy: disable-error-code="1"
"""
Sends a request to a Document AI Specialized Parser Processor
"""
from google.cloud import documentai_v1 as documentai
import pandas as pd


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """Send one local file to a Document AI processor and return the result.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the resource name; also used as the
            prefix of the API endpoint host.
        processor_id: Processor component of the resource name. The processor
            must already exist (create it in the Cloud Console first).
        file_path: Path of the local file whose bytes are uploaded.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the ``process_document`` response.
    """
    # Point the client at the endpoint host derived from `location`.
    endpoint_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    # Full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    processor_name = client.processor_path(project_id, location, processor_id)

    # Pull the whole file into memory as raw bytes.
    with open(file_path, "rb") as source:
        payload = source.read()

    # Wrap the bytes in a RawDocument, attach it to a ProcessRequest for the
    # target processor, and submit the request in a single online call.
    response = client.process_document(
        request=documentai.ProcessRequest(
            name=processor_name,
            raw_document=documentai.RawDocument(
                content=payload,
                mime_type=mime_type,
            ),
        )
    )
    return response.document


PROJECT_ID = "YOUR_PROJECT_ID"
LOCATION = "YOUR_PROJECT_LOCATION"  # Format is 'us' or 'eu'
PROCESSOR_ID = "INVOICE_PARSER_ID"  # Create processor in Cloud Console

# The local file in your current working directory
FILE_PATH = "google_invoice.pdf"
# Refer to https://cloud.google.com/document-ai/docs/processors-list
# for supported file types
MIME_TYPE = "application/pdf"

document = online_process(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
)

# Parallel column accumulators: index i of each list describes the same row.
types, raw_values, normalized_values, confidence = [], [], [], []

# Grab each key/value pair and their corresponding confidence scores.
for entity in document.entities:
    # Emit one row for the entity itself, then one row for each of its
    # properties (sub-entities), in the order they appear on the entity.
    for node in (entity, *entity.properties):
        types.append(node.type_)
        raw_values.append(node.mention_text)
        normalized_values.append(node.normalized_value.text)
        # Confidence is rendered as a whole-number percentage string.
        confidence.append(f"{node.confidence:.0%}")

# Create a Pandas Dataframe to print the values in tabular format.
columns = {
    "Type": types,
    "Raw Value": raw_values,
    "Normalized Value": normalized_values,
    "Confidence": confidence,
}
df = pd.DataFrame(columns)

print(df)