documentai/snippets/handle_response_sample.py

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# [START documentai_process_ocr_document]
# [START documentai_process_form_document]
# [START documentai_process_specialized_document]
# [START documentai_process_splitter_document]
# [START documentai_process_layout_document]
# [START documentai_process_custom_extractor_document]

from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_PROCESSOR_LOCATION"  # Format is "us" or "eu"
# processor_id = "YOUR_PROCESSOR_ID"  # Create processor before running sample
# processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
# file_path = "/path/to/local/pdf"
# mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

# [END documentai_process_ocr_document]
# [END documentai_process_form_document]
# [END documentai_process_specialized_document]
# [END documentai_process_splitter_document]
# [END documentai_process_layout_document]
# [END documentai_process_custom_extractor_document]


# [START documentai_process_ocr_document]
def process_document_ocr_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Optional: Additional configurations for Document OCR Processor.
    # For more information: https://cloud.google.com/document-ai/docs/enterprise-document-ocr
    process_options = documentai.ProcessOptions(
        ocr_config=documentai.OcrConfig(
            enable_native_pdf_parsing=True,
            enable_image_quality_scores=True,
            enable_symbol=True,
            # OCR Add Ons https://cloud.google.com/document-ai/docs/ocr-add-ons
            premium_features=documentai.OcrConfig.PremiumFeatures(
                compute_style_info=True,
                enable_math_ocr=False,  # Enable to use Math OCR Model
                enable_selection_mark_detection=True,
            ),
        )
    )
    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    text = document.text
    print(f"Full document text: {text}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    for page in document.pages:
        print(f"Page {page.page_number}:")
        print_page_dimensions(page.dimension)
        print_detected_languages(page.detected_languages)

        print_blocks(page.blocks, text)
        print_paragraphs(page.paragraphs, text)
        print_lines(page.lines, text)
        print_tokens(page.tokens, text)

        if page.symbols:
            print_symbols(page.symbols, text)

        if page.image_quality_scores:
            print_image_quality_scores(page.image_quality_scores)

        if page.visual_elements:
            print_visual_elements(page.visual_elements, text)


def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:
    print(f" Width: {str(dimension.width)}")
    print(f" Height: {str(dimension.height)}")


def print_detected_languages(
    detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],
) -> None:
    print(" Detected languages:")
    for lang in detected_languages:
        print(f" {lang.language_code} ({lang.confidence:.1%} confidence)")


def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f" {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f" First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f" Last text block: {repr(last_block_text)}")


def print_paragraphs(
    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f" {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f" First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f" Last paragraph text: {repr(last_paragraph_text)}")


def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:
    print(f" {len(lines)} lines detected:")
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f" First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f" Last line text: {repr(last_line_text)}")


def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:
    print(f" {len(tokens)} tokens detected:")
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f" First token text: {repr(first_token_text)}")
    print(f" First token break type: {repr(first_token_break_type)}")
    if tokens[0].style_info:
        print_style_info(tokens[0].style_info)

    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f" Last token text: {repr(last_token_text)}")
    print(f" Last token break type: {repr(last_token_break_type)}")
    if tokens[-1].style_info:
        print_style_info(tokens[-1].style_info)


def print_symbols(
    symbols: Sequence[documentai.Document.Page.Symbol], text: str
) -> None:
    print(f" {len(symbols)} symbols detected:")
    first_symbol_text = layout_to_text(symbols[0].layout, text)
    print(f" First symbol text: {repr(first_symbol_text)}")
    last_symbol_text = layout_to_text(symbols[-1].layout, text)
    print(f" Last symbol text: {repr(last_symbol_text)}")


def print_image_quality_scores(
    image_quality_scores: documentai.Document.Page.ImageQualityScores,
) -> None:
    print(f" Quality score: {image_quality_scores.quality_score:.1%}")
    print(" Detected defects:")
    for detected_defect in image_quality_scores.detected_defects:
        print(f" {detected_defect.type_}: {detected_defect.confidence:.1%}")


def print_style_info(style_info: documentai.Document.Page.Token.StyleInfo) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    print(f" Font Size: {style_info.font_size}pt")
    print(f" Font Type: {style_info.font_type}")
    print(f" Bold: {style_info.bold}")
    print(f" Italic: {style_info.italic}")
    print(f" Underlined: {style_info.underlined}")
    print(f" Handwritten: {style_info.handwritten}")
    print(
        f" Text Color (RGBa): {style_info.text_color.red}, {style_info.text_color.green}, {style_info.text_color.blue}, {style_info.text_color.alpha}"
    )


def print_visual_elements(
    visual_elements: Sequence[documentai.Document.Page.VisualElement], text: str
) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    checkboxes = [x for x in visual_elements if "checkbox" in x.type]
    math_symbols = [x for x in visual_elements if x.type == "math_formula"]

    if checkboxes:
        print(f" {len(checkboxes)} checkboxes detected:")
        print(f" First checkbox: {repr(checkboxes[0].type)}")
        print(f" Last checkbox: {repr(checkboxes[-1].type)}")

    if math_symbols:
        print(f" {len(math_symbols)} math symbols detected:")
        first_math_symbol_text = layout_to_text(math_symbols[0].layout, text)
        print(f" First math symbol: {repr(first_math_symbol_text)}")


# [END documentai_process_ocr_document]


# [START documentai_process_form_document]
def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Read the table and form fields output from the processor.
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.

    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Read the form fields and tables output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            num_columns = len(table.header_rows[0].cells)
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print("Columns:")
            print_table_rows(table.header_rows, text)
            # Print body rows
            print("Table body data:")
            print_table_rows(table.body_rows, text)

        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f" * {repr(name.strip())}: {repr(value.strip())}")

    # Supported in version `pretrained-form-parser-v2.0-2022-11-10` and later.
    # For more information: https://cloud.google.com/document-ai/docs/form-parser
    if document.entities:
        print(f"Found {len(document.entities)} generic entities:")
        for entity in document.entities:
            print_entity(entity)
            # Print Nested Entities
            for prop in entity.properties:
                print_entity(prop)

    return document


def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    for table_row in table_rows:
        row_text = ""
        for cell in table_row.cells:
            cell_text = layout_to_text(cell.layout, text)
            row_text += f"{repr(cell_text.strip())} | "
        print(row_text)


# [END documentai_process_form_document]


# [START documentai_process_specialized_document]
def process_document_entity_extraction_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Print extracted entities from entity extraction processor output.
    # For a complete list of processors see:
    # https://cloud.google.com/document-ai/docs/processors-list
    #
    # OCR and other data is also present in the processor's response.
    # Refer to the OCR samples for how to parse other data in the response.
    print(f"Found {len(document.entities)} entities:")
    for entity in document.entities:
        print_entity(entity)
        # Print Nested Entities (if any)
        for prop in entity.properties:
            print_entity(prop)


# [END documentai_process_specialized_document]


# [START documentai_process_custom_extractor_document]
def process_document_custom_extractor_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Entities to extract from Foundation Model CDE
    properties = [
        documentai.DocumentSchema.EntityType.Property(
            name="invoice_id",
            value_type="string",
            occurrence_type=documentai.DocumentSchema.EntityType.Property.OccurrenceType.REQUIRED_ONCE,
        ),
        documentai.DocumentSchema.EntityType.Property(
            name="notes",
            value_type="string",
            occurrence_type=documentai.DocumentSchema.EntityType.Property.OccurrenceType.OPTIONAL_MULTIPLE,
        ),
        documentai.DocumentSchema.EntityType.Property(
            name="terms",
            value_type="string",
            occurrence_type=documentai.DocumentSchema.EntityType.Property.OccurrenceType.OPTIONAL_MULTIPLE,
        ),
    ]

    # Optional: For Generative AI processors, request different fields than the
    # schema for a processor version
    process_options = documentai.ProcessOptions(
        schema_override=documentai.DocumentSchema(
            display_name="CDE Schema",
            description="Document Schema for the CDE Processor",
            entity_types=[
                documentai.DocumentSchema.EntityType(
                    name="custom_extraction_document_type",
                    base_types=["document"],
                    properties=properties,
                )
            ],
        )
    )

    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    for entity in document.entities:
        print_entity(entity)
        # Print Nested Entities (if any)
        for prop in entity.properties:
            print_entity(prop)


# [START documentai_process_form_document]
# [START documentai_process_specialized_document]
def print_entity(entity: documentai.Document.Entity) -> None:
    # Fields detected. For a full list of fields for each processor see
    # the processor documentation:
    # https://cloud.google.com/document-ai/docs/processors-list
    key = entity.type_

    # Some other value formats in addition to text are available
    # e.g. dates: `entity.normalized_value.date_value.year`
    text_value = entity.text_anchor.content or entity.mention_text
    confidence = entity.confidence
    normalized_value = entity.normalized_value.text

    print(f" * {repr(key)}: {repr(text_value)} ({confidence:.1%} confident)")

    if normalized_value:
        print(f" * Normalized Value: {repr(normalized_value)}")


# [END documentai_process_form_document]
# [END documentai_process_specialized_document]
# [END documentai_process_custom_extractor_document]


# [START documentai_process_splitter_document]
def process_document_splitter_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Read the splitter output from a document splitter/classifier processor:
    # e.g. https://cloud.google.com/document-ai/docs/processors-list#processor_procurement-document-splitter
    # This processor only provides text for the document and information on how
    # to split the document on logical boundaries. To identify and extract text,
    # form elements, and entities please see other processors like the OCR, form,
    # and specialized processors.
    print(f"Found {len(document.entities)} subdocuments:")
    for entity in document.entities:
        conf_percent = f"{entity.confidence:.1%}"
        pages_range = page_refs_to_string(entity.page_anchor.page_refs)

        # Print subdocument type information, if available
        if entity.type_:
            print(
                f"{conf_percent} confident that {pages_range} a '{entity.type_}' subdocument."
            )
        else:
            print(f"{conf_percent} confident that {pages_range} a subdocument.")


def page_refs_to_string(
    page_refs: Sequence[documentai.Document.PageAnchor.PageRef],
) -> str:
    """Converts a page ref to a string describing the page or page range."""
    pages = [str(int(page_ref.page) + 1) for page_ref in page_refs]
    if len(pages) == 1:
        return f"page {pages[0]} is"
    else:
        return f"pages {', '.join(pages)} are"


# [END documentai_process_splitter_document]


# [START documentai_process_layout_document]
def process_document_layout_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    process_options = documentai.ProcessOptions(
        layout_config=documentai.ProcessOptions.LayoutConfig(
            chunking_config=documentai.ProcessOptions.LayoutConfig.ChunkingConfig(
                chunk_size=1000,
                include_ancestor_headings=True,
            )
        )
    )

    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    print("Document Layout Blocks")
    for block in document.document_layout.blocks:
        print(block)

    print("Document Chunks")
    for chunk in document.chunked_document.chunks:
        print(chunk)

    # [END documentai_process_layout_document]
    return document


# [START documentai_process_ocr_document]
# [START documentai_process_form_document]
# [START documentai_process_specialized_document]
# [START documentai_process_splitter_document]
# [START documentai_process_layout_document]
# [START documentai_process_custom_extractor_document]
def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Optional: additional configuration, used above by the OCR, layout,
        # and custom extractor samples.
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


# [END documentai_process_specialized_document]
# [END documentai_process_splitter_document]
# [END documentai_process_layout_document]
# [END documentai_process_custom_extractor_document]


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# [END documentai_process_form_document]
# [END documentai_process_ocr_document]
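

# Usage sketch (not part of the documented snippets above): a minimal example of
# how one of the samples might be invoked for a local run. All values below are
# hypothetical placeholders (assumptions), not real resources; substitute your
# own project, processor, and input file before running.
if __name__ == "__main__":
    process_document_ocr_sample(
        project_id="YOUR_PROJECT_ID",  # hypothetical placeholder
        location="us",  # "us" or "eu"
        processor_id="YOUR_PROCESSOR_ID",  # create the processor first
        processor_version="rc",
        file_path="/path/to/local/pdf",  # hypothetical local file
        mime_type="application/pdf",
    )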