# Line Item Improver Crosspage

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This tool reads parsed JSON files, merges line items that were split across two consecutive pages but belong to a single logical line item, and writes the updated JSON files.

## Prerequisites
* Jupyter Platform to run Python code
* Parsed json files in GCS Folder
* Output GCS folder to upload the updated json files


## Step by Step Procedure

### 1. Import Modules/Packages

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage

In [None]:
import copy
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from tqdm import tqdm

from utilities import file_names, store_document_as_json

### 2. Input Details

* **gcs_input_path**: Provide the gcs path of the parent folder where the sub-folders contain input files. Please follow the folder structure described earlier.
* **project_id**: Project ID/Number of the Project.
* **line_item_name**: Name of the parent (line item) entity, as defined in the processor schema, whose cross-page instances should be merged.
* **gcs_output_path**: Provide gcs path where the output json files have to be saved

In [None]:
# Folder holding the parsed JSON files; the trailing '/' is mandatory.
gcs_input_path = "gs://<<bucket_name>>/<<subfolder_path>>/"
# Project ID.
project_id = "project_id"
# Name of the line item entity (parent entity) to merge, as per the processor schema.
line_item_name = "parent_entity_name"
# Folder where the updated JSON files are saved; the trailing '/' is mandatory.
gcs_output_path = "gs://<<bucket_name>>/<<subfolder_path>>/"

### 3. Run the required functions

In [None]:
def get_page_wise_sorted_line_items_and_schema(
    json_dict: Dict[str, Any]
) -> Tuple[Dict[str, List[Dict[str, Any]]], List[str]]:
    """
    Groups line items by page, sorts each page top-to-bottom and collects the child schema.

    Only entities whose type equals the module-level ``line_item_name`` and that
    carry a non-empty ``properties`` list are considered. Entities without any
    page reference are skipped, because they cannot be assigned to a page.

    Args:
    - json_dict (Dict[str, Any]): The document JSON dictionary containing entity data.
      A missing or empty "entities" key yields an empty result.

    Returns:
    - Tuple[Dict[str, List[Dict[str, Any]]], List[str]]:
        - A dictionary where keys are page numbers (always strings; "0" when the
          page reference has no "page" key) and values are the line items of that
          page as ordered by ``sort_line_items_y``.
        - A sorted list of unique property types across all grouped line items.
    """
    line_items_page: Dict[str, List[Dict[str, Any]]] = {}
    for entity in json_dict.get("entities") or []:
        if entity.get("type") != line_item_name or not entity.get("properties"):
            continue

        page_refs = (entity.get("pageAnchor") or {}).get("pageRefs") or []
        if not page_refs:
            continue  # No page reference: the entity cannot be placed on a page

        # Normalise to str so lookups such as line_items_pages[str(i)] always match
        page = str(page_refs[0].get("page", "0"))
        line_items_page.setdefault(page, []).append(entity)

    sorted_line_items = {
        page: sort_line_items_y(line_items)  # Sort line items by y-coordinate
        for page, line_items in line_items_page.items()
    }

    # Collect unique property types; properties without a type are ignored so
    # that the final sort never has to compare None with str.
    schema_types: Set[str] = {
        property_item["type"]
        for line_items in line_items_page.values()
        for item in line_items
        for property_item in item["properties"]
        if property_item.get("type")
    }

    return sorted_line_items, sorted(schema_types)


def sort_line_items_y(line_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Sorts line items by the minimum y-coordinate of their bounding polygons.

    The y-coordinate is read from the normalized vertices of the first page
    reference of each item. A vertex without a "y" key is treated as y = 0
    (JSON exports of protobuf messages commonly omit zero-valued fields).
    Items that have no bounding polygon at all are placed after every item
    that has one. The sort is stable, so ties keep their input order.

    Args:
    - line_items (List[Dict[str, Any]]): A list of line items to be sorted.

    Returns:
    - List[Dict[str, Any]]: A new list of the same items ordered by their y-coordinate.
    """

    def _top_edge(item: Dict[str, Any]) -> float:
        # Walk the optional nesting defensively; any missing level means "no polygon"
        page_refs = (item.get("pageAnchor") or {}).get("pageRefs") or [{}]
        bounding_poly = page_refs[0].get("boundingPoly") or {}
        vertices = bounding_poly.get("normalizedVertices") or []
        return min(
            (vertex.get("y", 0.0) for vertex in vertices),
            default=float("inf"),
        )

    return sorted(line_items, key=_top_edge)


def _append_text(
    target: Dict[str, Any], key: str, addition: Optional[str], separator: str
) -> None:
    """Appends `addition` to target[key] using `separator`; creates the key if absent."""
    if addition is None:
        return
    if key in target:
        target[key] = target[key] + separator + addition
    else:
        target[key] = addition


def merge_line_items_cross_page(
    line_items_cur_page: List[Dict[str, Any]],
    line_items_next_page: List[Dict[str, Any]],
    li_schema: List[str],
) -> Optional[Dict[str, Any]]:
    """
    Merges the last line item of a page with the first line item of the next page.

    The two items are treated as halves of one logical line item only when no
    child type occurs more than once across both of them. All properties of
    both items are inspected, not just the positions the two lists share.

    The inputs are never modified: the result is built from deep copies, so a
    failed or rejected merge leaves both original entities untouched.

    Args:
    - line_items_cur_page (List[Dict[str, Any]]): List of line items from the current page.
    - line_items_next_page (List[Dict[str, Any]]): List of line items from the next page.
    - li_schema (List[str]): List of unique property types in the schema. Child
      entities are taken over from the next page in this order.

    Returns:
    - Optional[Dict[str, Any]]: The merged line item entity, or None if no merge
      is required (either list is empty, or a child type is duplicated).
    """
    if not line_items_cur_page or not line_items_next_page:
        return None  # Nothing to merge on one side

    last_li_cur_page = line_items_cur_page[-1]
    first_li_next_page = line_items_next_page[0]

    # Check for repeated child types across the last and first line items
    child_types = [
        child.get("type")
        for line_item in (last_li_cur_page, first_li_next_page)
        for child in line_item.get("properties") or []
    ]
    if len(set(child_types)) != len(child_types):
        return None  # No merge required

    # Work on copies so neither input entity is mutated
    merged_entity = copy.deepcopy(last_li_cur_page)
    incoming = copy.deepcopy(first_li_next_page)

    # Take over, in schema order, every child type the current item is missing
    merged_properties = merged_entity.setdefault("properties", [])
    present_types = {child.get("type") for child in merged_properties}
    incoming_properties = incoming.get("properties") or []
    for li_type in li_schema:
        if li_type in present_types:
            continue
        for child in incoming_properties:
            if child.get("type") == li_type:
                merged_properties.append(child)
                break

    # Update bounding polygon, text anchors, and mention text
    merged_entity.setdefault("pageAnchor", {}).setdefault("pageRefs", []).extend(
        (incoming.get("pageAnchor") or {}).get("pageRefs") or []
    )
    merged_text_anchor = merged_entity.setdefault("textAnchor", {})
    incoming_text_anchor = incoming.get("textAnchor") or {}
    merged_text_anchor.setdefault("textSegments", []).extend(
        incoming_text_anchor.get("textSegments") or []
    )
    _append_text(
        merged_text_anchor, "content", incoming_text_anchor.get("content"), "\n"
    )
    _append_text(merged_entity, "mentionText", incoming.get("mentionText"), " ")

    return merged_entity

### 4. Run the code

In [None]:
if __name__ == "__main__":
    # Batch driver: for every JSON file under gcs_input_path, merge line items
    # that continue from one page onto the next, then write the (possibly
    # updated) document to gcs_output_path under the same file name.
    file_names_list, file_dict = file_names(gcs_input_path)
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(gcs_input_path.split("/")[2])
    output_bucket_name = gcs_output_path.split("/")[2]
    output_prefix = "/".join(gcs_output_path.split("/")[3:])

    for filename, filepath in tqdm(file_dict.items(), desc="Progress"):
        if ".json" not in filepath:
            continue

        print(filename)
        json_dict = json.loads(
            source_bucket.blob(filepath).download_as_bytes().decode("utf-8")
        )

        # Flag to track whether or not to update the json_dict after merging,
        # if no line items are merged, this flag will remain False
        is_entity_updated = False

        # get lineitems by pages
        (
            line_items_pages,
            line_items_schema,
        ) = get_page_wise_sorted_line_items_and_schema(json_dict)
        print("Schema: ", line_items_schema)

        # Walk the page keys that actually exist, in numeric order. Pages
        # without line items have no key, so counting from 0 up to
        # len(line_items_pages) would miss higher-numbered pages.
        page_keys = sorted(line_items_pages, key=int)

        # Merging the cross page line items
        for cur_key, next_key in zip(page_keys, page_keys[1:]):
            if int(next_key) != int(cur_key) + 1:
                continue  # Only directly consecutive pages can share a line item

            cur_page_li = line_items_pages[cur_key]
            next_page_li = line_items_pages[next_key]
            if not cur_page_li or not next_page_li:
                continue  # An earlier merge may have emptied one of the pages

            try:
                merged_li = merge_line_items_cross_page(
                    cur_page_li, next_page_li, line_items_schema
                )
            except (KeyError, IndexError, TypeError) as err:
                # Malformed entity: report it and leave both pages unchanged
                print(
                    f"Skipping merge between pages {cur_key} and {next_key}: {err!r}"
                )
                continue

            if merged_li is None:
                continue

            is_entity_updated = True
            cur_page_li[-1] = merged_li  # Replace the last item with the merged one
            next_page_li.pop(0)  # Its content now lives in merged_li

        if is_entity_updated:
            # Updating the entities in original json
            updated_line_items = [
                line_item
                for page_key in page_keys
                for line_item in line_items_pages[page_key]
            ]

            # NOTE(review): every entity of type line_item_name is replaced by
            # the grouped line items, so line items that were not grouped
            # (e.g. without properties) are not written back - confirm intended.
            updated_entities = [
                entity
                for entity in json_dict["entities"]
                if entity["type"] != line_item_name
            ]
            updated_entities.extend(updated_line_items)

            # NOTE(review): this orders entities of all pages by y-coordinate
            # only, so entities from different pages are interleaved.
            updated_entities = sort_line_items_y(updated_entities)
            json_dict["entities"] = updated_entities
            print("Line items merged successfully")
        else:
            print("No line items were merged")

        store_document_as_json(
            json.dumps(json_dict),
            output_bucket_name,
            output_prefix + filename,
        )

### Output Details

### Before
<img src='./images/before_1.png' width=600 height=800 alt="Sample Output"></img>
<img src='./images/before_2.png' width=600 height=800 alt="Sample Output"></img>
### After
<img src='./images/after_1.png' width=600 height=800 alt="Sample Output"></img>
<img src='./images/after_2.png' width=600 height=800 alt="Sample Output"></img>