def process_document()

in document-ai/code/main.py [0:0]


def process_document(bucket_name: str, object_name: str) -> None:
    """Run a GCS object through the form-parser and summary processors.

    The object is downloaded from the bucket, sent to the Document AI
    processor named by the ``FORM_PARSER_PROCESSOR`` environment variable to
    obtain the document text and its form fields, then sent to the processor
    named by ``SUMMARY_PROCESSOR`` to obtain a summary. The results are
    handed to ``process_output``.

    Args:
        bucket_name: Name of the GCS bucket holding the document.
        object_name: Name of the object inside the bucket.

    Returns:
        None. If either environment variable is unset, a message is printed
        and the function returns before downloading or calling a processor.
    """
    import tempfile  # local import: only needed for the scratch download

    print("Document processing started.")

    # Validate configuration up front so a missing setting does not cost a
    # download and a processor call whose results would be thrown away.
    processor_name = os.getenv("FORM_PARSER_PROCESSOR")
    if not processor_name:
        print("Environment variable FORM_PARSER_PROCESSOR not set")
        return
    summary_processor_name = os.getenv("SUMMARY_PROCESSOR")
    if not summary_processor_name:
        print("Environment variable SUMMARY_PROCESSOR not set")
        return

    client = documentai_v1beta3.DocumentProcessorServiceClient()

    # Download file
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)
    # A private scratch directory with a fixed file name keeps object names
    # containing "/" (or "..") from breaking or redirecting the local path,
    # and the directory is removed as soon as the bytes are in memory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        file_path = os.path.join(scratch_dir, "document")
        print(f"download document to...{file_path}")
        blob.download_to_filename(file_path)
        with open(file_path, "rb") as downloaded:
            image_content = downloaded.read()

    # The same raw payload is submitted to both processors.
    # NOTE(review): bucket.blob() does not fetch metadata itself; this relies
    # on the download having populated blob.content_type -- confirm.
    raw_document = {"content": image_content, "mime_type": blob.content_type}

    # Use the Document AI client to run the form parser
    form_request = {"name": processor_name, "document": raw_document}
    parsed_document = client.process_document(request=form_request).document
    document_text = parsed_document.text

    # Extract key value pairs
    document_dict = {}
    for page in parsed_document.pages:
        for form_field in page.form_fields:
            field_name = get_text(form_field.field_name, parsed_document)
            field_value = get_text(form_field.field_value, parsed_document)
            document_dict[f"{field_name}"] = field_value

    # Extract Summary
    print("Summarizing Document")
    summary_request = {"name": summary_processor_name, "document": raw_document}
    summary_document = client.process_document(request=summary_request).document
    entities = summary_document.entities
    if entities:
        summary_text = entities[0].mention_text
    else:
        # Keep the extracted text and form fields instead of failing with an
        # IndexError when the summary processor yields nothing.
        print("Summary processor returned no entities; using empty summary.")
        summary_text = ""

    print("Document processing complete.")
    process_output(bucket_name, object_name, document_text, summary_text, document_dict)