def json_to_dataframe()

in incubator-tools/best-practices/utilities/utilities.py [0:0]


def json_to_dataframe(data: documentai.Document) -> pd.DataFrame:
    """
    Flatten the entities of a Document AI document into a pandas DataFrame.

    Each entity in ``data.entities`` is handled as follows:

    * If the entity has at least one item in ``entity.properties``, every
      property is passed to ``get_entity_metadata`` and the parent entity
      itself is NOT added.
    * Otherwise the entity itself is passed to ``get_entity_metadata``.

    The function is best-effort: any exception raised while processing an
    entity or property is printed and processing moves on, so the returned
    frame may be partial (or empty) instead of an error being raised.

    Args:
        data: A Document AI ``Document`` whose ``entities`` are iterated.

    Returns:
        pandas.DataFrame: A frame created with the columns
        ['type_', 'mention_text', 'bbox', 'page']. Rows are produced by
        ``get_entity_metadata`` (defined elsewhere in this module), which
        presumably appends one row per entity/property it is given.
    """
    frame = pd.DataFrame(columns=["type_", "mention_text", "bbox", "page"])

    try:
        for parent in data.entities:
            # Count the properties we iterate over so we know afterwards
            # whether the parent itself still needs to be recorded.
            children_seen = 0

            try:
                for child in parent.properties:
                    children_seen += 1
                    try:
                        frame = get_entity_metadata(frame, child)
                    except Exception as err:
                        # A failing property is reported and skipped; the
                        # remaining properties are still processed.
                        print(err)
            except Exception as err:
                # Failure while walking the properties: skip this entity
                # entirely, including the parent-level fallback below.
                print(f"Exception encountered: {err}")
                continue

            if children_seen:
                # Properties stand in for their parent entity.
                continue

            # Leaf entity (no properties): record the entity itself.
            try:
                frame = get_entity_metadata(frame, parent)
            except Exception as err:
                print(f"Exception encountered: {err}")
    except Exception as err:
        # Failure reading data.entities itself: report it and fall through
        # to return whatever has been collected so far.
        print(f"Exception encountered: {err}")

    return frame