in document-ai/code/main.py [0:0]
def process_document(bucket_name, object_name):
    """Process a document stored in GCS.

    Downloads the object, sends it to the form parser processor to collect
    form-field key/value pairs, sends it to the summary processor to obtain
    a summary, and passes the results to ``process_output``.

    Args:
        bucket_name: Name of the GCS bucket that holds the document.
        object_name: Name of the object inside that bucket.

    Returns:
        None. If the ``FORM_PARSER_PROCESSOR`` or ``SUMMARY_PROCESSOR``
        environment variable is not set, a message is printed and the
        function returns without calling ``process_output``.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    import tempfile

    print("Document processing started.")

    # Validate configuration before doing any network work: without both
    # processors the function cannot produce output, so there is no point
    # in downloading the file or calling the form parser first.
    processor_name = os.getenv("FORM_PARSER_PROCESSOR")
    if not processor_name:
        print("Environment variable FORM_PARSER_PROCESSOR not set")
        return
    summary_processor_name = os.getenv("SUMMARY_PROCESSOR")
    if not summary_processor_name:
        print("Environment variable SUMMARY_PROCESSOR not set")
        return

    client = documentai_v1beta3.DocumentProcessorServiceClient()

    # Download file
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)

    # Object names may contain "/" (or ".."), so only the final path
    # component is used for the local file name. The temporary directory is
    # removed on exit, so no downloaded file is left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_name = os.path.basename(object_name) or "document"
        file_path = os.path.join(tmp_dir, local_name)
        print("download document to..." + file_path)
        blob.download_to_filename(file_path)
        # Read the file into memory
        with open(file_path, "rb") as image:
            image_content = image.read()

    # The same raw document payload is sent to both processors.
    raw_document = {"content": image_content, "mime_type": blob.content_type}

    # Use the Document AI client to run the form parser
    form_request = {"name": processor_name, "document": raw_document}
    form_document = client.process_document(request=form_request).document
    document_text = form_document.text

    # Extract key value pairs
    document_dict = {}
    for page in form_document.pages:
        for form_field in page.form_fields:
            field_name = get_text(form_field.field_name, form_document)
            field_value = get_text(form_field.field_value, form_document)
            document_dict[str(field_name)] = field_value

    # Extract Summary
    print("Summarizing Document")
    summary_request = {"name": summary_processor_name, "document": raw_document}
    summary_document = client.process_document(request=summary_request).document

    # Guard against a response with no entities instead of raising
    # IndexError; the remaining results are still passed on.
    summary_entities = summary_document.entities
    if summary_entities:
        summary_text = summary_entities[0].mention_text
    else:
        print("Summary processor returned no entities")
        summary_text = ""

    print("Document processing complete.")
    process_output(bucket_name, object_name, document_text, summary_text, document_dict)