in fraud-detection-python/cloud-functions/process-invoices/main.py [0:0]
def extract_document_entities(document: documentai.Document) -> dict:
    """
    Get all entities from a document and output as a dictionary
    Flattens nested entities/properties (each entity's direct properties
    are added next to the entity itself; deeper levels are not visited)
    Format: entity.type_: entity.mention_text OR entity.normalized_value.text

    Keys are the entity type with "/" replaced by "_". A type that occurs
    once maps to a single value; a type that occurs several times
    (e.g. line_item) maps to a list of values in document order.
    """
    document_entities: Dict[str, Any] = {}

    def extract_document_entity(entity: documentai.Document.Entity):
        """
        Extract Single Entity and Add to Entity Dictionary
        """
        entity_key = entity.type_.replace("/", "_")
        normalized_value = getattr(entity, "normalized_value", None)
        # Prefer the normalized text when a normalized value is present.
        new_entity_value = (
            normalized_value.text if normalized_value else entity.mention_text
        )

        # Test for key presence rather than truthiness of the stored value:
        # an earlier empty value ("") must still count as an occurrence,
        # otherwise it is overwritten and the lists built for repeated
        # entities (e.g. line_item properties) fall out of alignment.
        if entity_key not in document_entities:
            document_entities[entity_key] = new_entity_value
            return

        # For entities that can have multiple (e.g. line_item)
        existing_entity = document_entities[entity_key]
        if isinstance(existing_entity, list):
            existing_entity.append(new_entity_value)
        else:
            # Second occurrence: change the stored value to a list.
            document_entities[entity_key] = [existing_entity, new_entity_value]

    for entity in document.entities:
        # Fields detected. For a full list of fields for each processor see
        # the processor documentation:
        # https://cloud.google.com/document-ai/docs/processors-list
        extract_document_entity(entity)

        # Properties are Sub-Entities
        for prop in entity.properties:
            extract_document_entity(prop)

    return document_entities