in incubator-tools/best-practices/utilities/utilities.py [0:0]
def json_to_dataframe(data: documentai.Document) -> pd.DataFrame:
    """
    Flatten the entities of a Document AI ``Document`` into a pandas DataFrame.

    Each entity in ``data.entities`` is handled as follows:

    * If the entity has one or more ``properties`` (sub-entities), every
      property is passed to ``get_entity_metadata`` and the parent entity
      itself is NOT added.
    * Otherwise the entity itself is passed to ``get_entity_metadata``.

    Conversion is best-effort: a failure on a single entity or property is
    printed to stdout and that item is skipped; the function never raises for
    such failures and always returns the rows collected so far.

    Args:
        data: A loaded Document AI ``Document`` whose ``entities`` are read.

    Returns:
        A DataFrame created with the columns
        ['type_', 'mention_text', 'bbox', 'page'] and extended by
        ``get_entity_metadata`` (defined elsewhere in this module):
        'type_' column indicates the type of entity.
        'mention_text' column contains the text of the entity or its property.
        'bbox' column contains bounding box coordinates.
        'page' column indicates the page number where the entity is found.
    """
    df = pd.DataFrame(columns=["type_", "mention_text", "bbox", "page"])

    # NOTE: the broad ``except Exception`` handlers below are deliberate; one
    # malformed entity must not abort the whole conversion.
    try:
        for entity in data.entities:
            # Set as soon as iteration yields a property, i.e. *before* the
            # property is converted. A parent whose properties all fail to
            # convert is therefore still not added on its own.
            has_properties = False

            try:
                for subentity in entity.properties:
                    has_properties = True
                    try:
                        df = get_entity_metadata(df, subentity)
                    except Exception as e:
                        # Skip only this property; keep processing the rest.
                        print(e)
            except Exception as e:
                # Reading/iterating ``properties`` itself failed: skip the
                # whole entity, including the no-properties fallback below.
                print(f"Exception encountered: {e}")
                continue

            # Leaf entity (no properties): record the entity itself.
            if not has_properties:
                try:
                    df = get_entity_metadata(df, entity)
                except Exception as e:
                    print(f"Exception encountered: {e}")
    except Exception as e:
        # Failure while reading/iterating ``data.entities``; fall through and
        # return whatever was collected before the error.
        print(f"Exception encountered: {e}")

    return df