# Document Schema from Form Parser Output

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Objective

This document explains how to create a document schema, in CSV format, from the key-value pairs of Form Parser output. The generated schema can be reviewed and edited, and can then be applied to any other processor through the API.


## Prerequisites
* Python : Jupyter notebook (Vertex AI) 
* Service account permissions in projects.
* GCS Folder Path which has form parser parsed jsons


## Step by Step procedure 

### 1.Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
import json
import re
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from tqdm import tqdm
from utilities import *
import pandas as pd
from collections import Counter, defaultdict
import csv

### 2.Setup the required inputs
* `project_id` : Your Google project id or name
* `formparser_parsed_jsons_path` : GCS storage path where the form parser output is saved

In [None]:
# Google Cloud project id or name (placeholder; replace with your own).
project_id = "xxx-xxxx-xxxx"
# GCS folder holding the Form Parser output JSONs, in "gs://<bucket>/<prefix>/" form.
# The bucket name is later taken as the third "/"-separated component of this path.
formparser_parsed_jsons_path = "gs://xxxx/xxxx/xxx/"

### 3.Importing Required functions

In [None]:
def get_schema_file(json_dict: object) -> List[Dict[str, Any]]:
    """
    Builds a per-document schema from the form fields of a Form Parser result.

    Each form field name on every page is turned into an entity type: spaces
    become underscores, the text is lower-cased and stripped, every character
    that is neither a word character nor whitespace is removed, and a single
    trailing underscore is dropped. Fields whose cleaned name is empty are
    skipped.

    Args:
        json_dict (object): The parsed Document AI document. Only ``pages`` and
            each page's ``form_fields`` (name and value text anchors) are read;
            a page without ``form_fields`` contributes nothing.

    Returns:
        List[Dict[str, Any]]: One dictionary per distinct entity type, in
        first-seen order, with the keys ``entity_type``, ``occurrence``
        ("OPTIONAL_ONCE" when the type appears once in the document,
        "OPTIONAL_MULTIPLE" otherwise), ``entity_mention_text`` (value text of
        the first field with that name) and ``value_type`` (always "string").
    """

    # Insertion-ordered dicts give O(1) de-duplication while keeping the
    # first-seen ordering of entity types.
    first_mention_text = {}
    times_seen = {}

    for page_data in json_dict.pages:
        for form_field in getattr(page_data, "form_fields", []):
            # Cleaning the entity name
            raw_name = form_field.field_name.text_anchor.content
            key_name = re.sub(
                r"[^\w\s]",
                "",
                raw_name.replace(" ", "_").lower().strip(),
            )
            # endswith() is safe on an empty string, unlike key_name[-1].
            if key_name.endswith("_"):
                key_name = key_name[:-1]
            if not key_name:
                # Nothing usable is left of the label (e.g. it was only ":").
                continue

            if key_name not in first_mention_text:
                first_mention_text[
                    key_name
                ] = form_field.field_value.text_anchor.content
            times_seen[key_name] = times_seen.get(key_name, 0) + 1

    return [
        {
            "entity_type": entity_type,
            "occurrence": (
                "OPTIONAL_MULTIPLE"
                if times_seen[entity_type] > 1
                else "OPTIONAL_ONCE"
            ),
            "entity_mention_text": mention_text,
            "value_type": "string",
        }
        for entity_type, mention_text in first_mention_text.items()
    ]


def get_consolidated_schema(data: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """
    Merges per-file schemas into one schema by majority vote.

    For every entity type seen in any file, the most frequent ``occurrence``
    and the most frequent ``value_type`` across all files are kept. The merged
    schema is also written to "document_schema.csv" in the working directory.

    Args:
        data (List[List[Dict[str, Any]]]): One list per file; each inner item
            is a dictionary with "entity_type", "occurrence" and "value_type".

    Returns:
        List[Dict[str, Any]]: One dictionary per entity type (first-seen
        order) holding its majority occurrence and value type.
    """

    tallies = {}
    for file_schema in data:
        for record in file_schema:
            name = record["entity_type"]
            seen_occurrence = record["occurrence"]
            seen_value_type = record["value_type"]

            # One pair of counters per entity type, created on first sight.
            votes = tallies.setdefault(
                name, {"occurrence": Counter(), "value_type": Counter()}
            )
            votes["occurrence"][seen_occurrence] += 1
            votes["value_type"][seen_value_type] += 1

    # Keep the winning occurrence / value type for each entity type.
    result = [
        {
            "entity_type": name,
            "occurrence": votes["occurrence"].most_common(1)[0][0],
            "value_type": votes["value_type"].most_common(1)[0][0],
        }
        for name, votes in tallies.items()
    ]

    # Persist the merged schema so it can be reviewed and edited by hand.
    pd.DataFrame(result).to_csv("document_schema.csv")

    return result


def get_allfiles_csv(data: Dict[str, List[Dict[str, Any]]]) -> None:
    """
    Writes every file's entities to "Allfiles_data.csv", one row per entity.

    Args:
        data (Dict[str, List[Dict[str, Any]]]): Maps a filename to the list of
            entity dictionaries extracted from that file. Each entity supplies
            the "entity_type", "occurrence", "entity_mention_text" and
            "value_type" columns.

    Returns:
        None
    """

    columns = [
        "filename",
        "entity_type",
        "occurrence",
        "entity_mention_text",
        "value_type",
    ]
    output_path = "Allfiles_data.csv"

    # Snapshot each file's entities into its own list before any output is written.
    per_file = {name: list(file_entities) for name, file_entities in data.items()}

    with open(output_path, "w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for name, file_entities in per_file.items():
            for entity in file_entities:
                # The filename is the first column, followed by the entity's own fields.
                writer.writerow({"filename": name, **entity})

### 4. Calling functions

In [None]:
# Build a schema for every parsed JSON under the input path, then consolidate them.
# The bucket name is the third "/"-separated component of "gs://<bucket>/...".
bucket_name = formparser_parsed_jsons_path.split("/")[2]
# file_names() is not defined in this notebook (presumably provided by utilities.py);
# the values of the second item it returns are used as the object paths to download.
files = list(file_names(formparser_parsed_jsons_path)[1].values())
list_schema = []  # one schema (list of entity dicts) per file, in processing order
file_wise = {}  # same schemas keyed by the file's base name
for file in tqdm(files, desc="Status : "):
    # documentai_json_proto_downloader() is likewise external to this notebook;
    # its result is passed straight to get_schema_file().
    json_dict = documentai_json_proto_downloader(bucket_name, file)
    file_schema = get_schema_file(json_dict)
    list_schema.append(file_schema)
    # Keyed by base name only: files with the same name in different folders overwrite each other.
    file_wise[file.split("/")[-1]] = file_schema
# To review the per-file data as well, uncomment the line below
# (it writes "Allfiles_data.csv").
# get_allfiles_csv(file_wise)
# Majority-vote merge of all per-file schemas; also writes "document_schema.csv".
consolidated_schema = get_consolidated_schema(list_schema)

### 5.CSV schema output

Form parser output in UI

<img src="./Images/Form_parser_output.png" width=800 height=400 alt="Form_parser_output"></img>

Retrieved schema from code in the form of csv(‘document_schema.csv’)

<img src="./Images/CSV_output.png" width=800 height=400 alt="CSV_output"></img>

### The above schema can be reviewed or modified as per the user requirements.

## Updating Schema to another parser

### 1.Setup the required inputs
* `project_number` : Your Google Cloud project number
* `location_processor` : Location of the processor
* `processor_id` : ID of the processor whose schema is to be updated
* `updated_schema_csv_path` : Path of the CSV file reviewed or modified in the step above

In [None]:
# Inputs used to build the resource name
# "projects/<project_number>/locations/<location>/processors/<processor_id>".
project_number = "XXXXXXXXXXXXXXXX"  # Google Cloud project number
location_processor = "us"  # location of the target processor
processor_id = "xxxxxxxxxxxxxxxx"  # processor whose schema will be updated
updated_schema_csv_path = (
    "document_schema.csv"  # schema CSV produced above, after review/modification
)

### Required functions

In [None]:
# helper functions
# fetch the current dataset schema of a processor
def get_dataset_schema(processor_name: str) -> Any:
    """
    Fetches the dataset schema that belongs to a processor.

    Args:
        processor_name (str): Processor resource name; "/dataset/datasetSchema"
            is appended to it to form the schema resource name.

    Returns:
        Any: The response returned by the Document AI ``get_dataset_schema`` call.
    """

    from google.cloud import documentai_v1beta3

    service = documentai_v1beta3.DocumentServiceClient()

    # The dataset schema lives under the processor's dataset resource.
    schema_resource = processor_name + "/dataset/datasetSchema"
    schema_request = documentai_v1beta3.GetDatasetSchemaRequest(name=schema_resource)

    return service.get_dataset_schema(request=schema_request)


# update schema
def update_dataset_schema(
    schema: "documentai_v1beta3.DatasetSchema",
) -> "documentai_v1beta3.DatasetSchema":
    """
    Updates the dataset schema of a processor.

    Args:
        schema (documentai_v1beta3.DatasetSchema): The dataset schema to store.
            Its ``name`` and ``document_schema`` fields are sent in the request,
            so it is typically the object returned by the get-dataset-schema
            call after modification.

    Returns:
        documentai_v1beta3.DatasetSchema: The response of the
        ``update_dataset_schema`` call.
    """

    # The annotations above are quoted so they are never evaluated: this import
    # is local, and the original `document.Document` annotation referred to a
    # name that no import in the notebook is shown to define.
    from google.cloud import documentai_v1beta3

    # Create a client
    client = documentai_v1beta3.DocumentServiceClient()

    # Initialize request argument(s)
    request = documentai_v1beta3.UpdateDatasetSchemaRequest(
        dataset_schema={"name": schema.name, "document_schema": schema.document_schema}
    )

    # Make the request and hand the response back to the caller
    return client.update_dataset_schema(request=request)

### Calling functions

In [None]:
# updating schema of processor
import pandas as pd

# Load the reviewed schema; the "entity_type", "value_type" and "occurrence"
# columns are required, any other column (e.g. a saved index) is ignored.
df_updated = pd.read_csv(updated_schema_csv_path)

# One property definition per CSV row, using the field names the schema expects.
schema_updated = [
    {
        "name": row.entity_type,
        "value_type": row.value_type,
        "occurrence_type": row.occurrence,
    }
    for row in df_updated.itertuples(index=False)
]

# Fetch the processor's current dataset schema so the new properties are added to it.
response_schema = get_dataset_schema(
    f"projects/{project_number}/locations/{location_processor}/processors/{processor_id}"
)

# NOTE(review): every existing entity type receives all of the properties, and
# nothing is added when the processor has no entity types yet; confirm that this
# matches the target processor's schema layout.
for entity_type in response_schema.document_schema.entity_types:
    for new_property in schema_updated:
        entity_type.properties.append(new_property)

# Push the modified schema back to the processor.
response = update_dataset_schema(response_schema)

### output
The above script adds the schema in the parser as below

<img src="./Images/processor_output.png" width=800 height=400 alt="processor_output"></img>