# Header and Footer entities

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

# Objective
The objective is to extract headers and footers from documents using OCR by leveraging bounding boxes at the top and bottom of each page. 

# Prerequisites
* Vertex AI Notebook
* GCS Folder Path
* DocumentAI Parsed JSONs

# Step-by-Step Procedure

## 1. Import Modules/Packages

In [7]:
# !pip install google-cloud-documentai

In [8]:
# Run this cell to download utilities module
# !wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [1]:
from google.cloud import documentai_v1beta3 as documentai

from utilities import (
    documentai_json_proto_downloader,
    file_names,
    store_document_as_json,
)

## 2. Input Details

* **GCS_INPUT_URI** : Input GCS folder path that contains the DocumentAI processor JSON results
* **GCS_OUTPUT_URI** : It is a GCS folder path to store post-processing results
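* **Y_STOP_GAP_HEADER** : Minimum vertical gap (in normalized page coordinates) between consecutive tokens, scanning from the top of the page, that separates the header from the general text
* **Y_STOP_GAP_FOOTER** : Minimum vertical gap (in normalized page coordinates) between consecutive tokens, scanning from the bottom of the page, that separates the footer from the general text
* **Y_HEADER_BORDER** : Optional fixed normalized y position below which text is no longer treated as header (eg: 0.06). When set, it is used instead of the gap-based header detection
* **Y_FOOTER_BORDER** : Optional fixed normalized y position at which footer text starts (eg: 0.90). When set, it is used instead of the gap-based footer detection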

In [6]:
# GCS folder that is listed for the Document AI processor JSON results to post-process
GCS_INPUT_URI = "gs://BUCKET/header_footer_entities/input/"
# GCS folder where the post-processed JSON files are uploaded
GCS_OUTPUT_URI = "gs://BUCKET/header_footer_entities/output/"

# configurable parameters
# Gap thresholds are compared against differences between the normalized top-y
# coordinates of tokens on a page.
Y_STOP_GAP_HEADER = 0.0260
Y_STOP_GAP_FOOTER = 0.001
# OR
# When a border is not None it replaces the gap-derived boundary for that region.
Y_HEADER_BORDER = None  # 0.06
Y_FOOTER_BORDER = None  # 0.90

In [7]:
# NOTE(review): this cell reassigns the same names as the preceding configuration
# cell, so these values are the ones in effect when both cells are run. The bucket
# below looks environment-specific -- replace it with your own before running.
GCS_INPUT_URI = "gs://siddamv/tools/header_footer_entities/input/"
GCS_OUTPUT_URI = "gs://siddamv/tools/header_footer_entities/output/"

# configurable parameters
Y_STOP_GAP_HEADER = 0.0260  # minimum gap to check between the header and general text
Y_STOP_GAP_FOOTER = 0.001  # minimum gap to check between the footer and general text
# OR
# When a border is not None it is used instead of the gap-derived boundary.
Y_HEADER_BORDER = None  # 0.06 #Header text  end maximum position
Y_FOOTER_BORDER = None  # 0.90  #Footer text start position

## 3. Run Below Code-Cells

In [9]:
def _token_bounds(token):
    """Return (min_x, min_y, max_x, max_y) of a token's normalized bounding polygon.

    Returns None when the token carries no normalized vertices, so callers can
    skip it instead of failing on min()/max() of an empty sequence.
    """
    vertices = token.layout.bounding_poly.normalized_vertices
    xs = [point.x for point in vertices]
    ys = [point.y for point in vertices]
    if not xs:
        return None
    return min(xs), min(ys), max(xs), max(ys)


def _copy_text_segments(token):
    """Return fresh TextSegment copies of the token's text-anchor segments."""
    return [
        documentai.Document.TextAnchor.TextSegment(
            start_index=segment.start_index, end_index=segment.end_index
        )
        for segment in token.layout.text_anchor.text_segments
    ]


def _build_region_entity(json_data, page_num, entity_type, boxes, segments):
    """Build one "header"/"footer" entity enclosing the given token boxes.

    Args:
        json_data: Document whose ``text`` the segments index into.
        page_num: ``page_number`` of the page the tokens belong to.
        entity_type: Value stored in the entity's ``type`` ("header" or "footer").
        boxes: Non-empty list of (min_x, min_y, max_x, max_y) tuples.
        segments: TextSegment objects collected from the region's tokens.

    Returns:
        The populated documentai.Document.Entity (not yet attached to the document).
    """
    min_x = min(box[0] for box in boxes)
    min_y = min(box[1] for box in boxes)
    max_x = max(box[2] for box in boxes)
    max_y = max(box[3] for box in boxes)

    # Order the segments by their position in the document text so that
    # mention_text and text_anchor describe the text in the same order.
    ordered_segments = sorted(segments, key=lambda segment: segment.end_index)
    mention_text = "".join(
        json_data.text[segment.start_index : segment.end_index]
        for segment in ordered_segments
    )

    # Vertices go clockwise from the top-left corner (TL, TR, BR, BL) so the
    # polygon outlines a rectangle instead of crossing itself.
    bounding_poly = documentai.BoundingPoly(
        normalized_vertices=[
            documentai.NormalizedVertex(x=min_x, y=min_y),
            documentai.NormalizedVertex(x=max_x, y=min_y),
            documentai.NormalizedVertex(x=max_x, y=max_y),
            documentai.NormalizedVertex(x=min_x, y=max_y),
        ]
    )
    # page_number is reduced by one here because the page reference is an index
    # into Document.pages.
    page_ref = documentai.Document.PageAnchor.PageRef(
        page=str(page_num - 1), bounding_poly=bounding_poly
    )
    page_anchor = documentai.Document.PageAnchor()
    page_anchor.page_refs = [page_ref]

    entity = documentai.Document.Entity()
    entity.mention_text = mention_text
    entity.type = entity_type
    entity.page_anchor = page_anchor
    entity.text_anchor.text_segments = ordered_segments
    return entity


def add_header_footer_entities(json_data: documentai.Document) -> documentai.Document:
    """
    Add one "header" and one "footer" entity per page for tokens in those regions.

    For every page the top-y coordinates of all tokens are sorted. The header
    boundary is the first coordinate (scanning downward) that follows a gap larger
    than Y_STOP_GAP_HEADER; the footer boundary is the coordinate just above the
    first gap (scanning upward) larger than Y_STOP_GAP_FOOTER. A non-None
    Y_HEADER_BORDER / Y_FOOTER_BORDER replaces the corresponding gap-derived value.
    Tokens whose top edge lies above the header boundary form the header entity;
    tokens whose top edge lies below the footer boundary form the footer entity.

    Header and footer are handled independently: a page without a header can
    still get a footer entity. When a region has no boundary or no tokens, a
    "NO HEADER"/"NO FOOTER" message is printed and no entity is added for it.

    Args:
        json_data (documentai.Document): DocumentAI Processor result in Document object format

    Returns:
        documentai.Document: The same Document object with the new entities appended
    """

    for page in json_data.pages:
        page_num = page.page_number

        # Pair each token with its bounds once; tokens without geometry are skipped.
        token_boxes = []
        for token in page.tokens:
            bounds = _token_bounds(token)
            if bounds is not None:
                token_boxes.append((token, bounds))
        tops = sorted(bounds[1] for _, bounds in token_boxes)

        # Header boundary: configured border, else first large gap from the top.
        header_limit = Y_HEADER_BORDER
        if header_limit is None:
            for upper, lower in zip(tops, tops[1:]):
                if lower - upper > Y_STOP_GAP_HEADER:
                    header_limit = lower
                    break

        # Footer boundary: configured border, else first large gap from the bottom.
        footer_limit = Y_FOOTER_BORDER
        if footer_limit is None:
            for index in range(len(tops) - 1, 0, -1):
                if tops[index] - tops[index - 1] > Y_STOP_GAP_FOOTER:
                    footer_limit = tops[index - 1]
                    break

        header_boxes, header_segments = [], []
        footer_boxes, footer_segments = [], []
        for token, bounds in token_boxes:
            top_y = bounds[1]
            # A None limit means no boundary was found, so the region is empty;
            # comparing a float against None would raise TypeError.
            if header_limit is not None and top_y < header_limit:
                header_boxes.append(bounds)
                header_segments.extend(_copy_text_segments(token))
            if footer_limit is not None and top_y > footer_limit:
                footer_boxes.append(bounds)
                footer_segments.extend(_copy_text_segments(token))

        regions = (
            ("header", header_boxes, header_segments),
            ("footer", footer_boxes, footer_segments),
        )
        for entity_type, boxes, segments in regions:
            if not boxes:
                print(f"NO {entity_type.upper()} page_number", page_num)
                continue
            json_data.entities.append(
                _build_region_entity(json_data, page_num, entity_type, boxes, segments)
            )
    return json_data


# Get the file names found under the input folder together with a mapping that is
# indexed by file name (presumably name -> blob path; see utilities.file_names).
file_names_list, file_names_dict = file_names(GCS_INPUT_URI)

# "gs://bucket/prefix/..." split on "/" gives ["gs:", "", bucket, *prefix parts].
input_bucket = GCS_INPUT_URI.split("/")[2]
output_bucket = GCS_OUTPUT_URI.split("/")[2]
# Drop any trailing slash so the object path is built the same way whether or not
# GCS_OUTPUT_URI ends with "/".
output_prefix = "/".join(GCS_OUTPUT_URI.split("/")[3:]).rstrip("/")

# Add header and footer entities to every file and upload the result.
for file_name in file_names_list:
    print("file_name: ", file_name)
    json_data = documentai_json_proto_downloader(
        input_bucket, file_names_dict[file_name]
    )
    json_data = add_header_footer_entities(json_data)
    print(f"\tUploading file {file_name} to GCS {GCS_OUTPUT_URI}")
    output_path = f"{output_prefix}/{file_name}" if output_prefix else file_name
    store_document_as_json(
        documentai.Document.to_json(json_data),
        output_bucket,
        output_path,
    )

file_name:  AWS DPA-0.json
	Uploading file AWS DPA-0.json to GCS gs://siddamv/tools/header_footer_entities/output/


## 4. Output Details

Refer to the image below for the post-processed results.

<img src='./images/output.png' width=1000 height=800></img>