# PDF Table Identification

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This Python script automates the identification and extraction of pages containing tables from PDF documents, streamlining the pre-processing workflow.

## Considerations and Limitations

The efficacy of the script may vary depending on the complexity of the documents. Image quality, lighting, and contrast variations can also affect performance. Manual intervention or specialized techniques may be required for optimal results.
There is also a risk of misidentification: a page that contains a table may be missed, and a text-dense page may be flagged as containing a table because its text forms a rectangular block.


## Prerequisites
* Access to a Vertex AI Notebook or Google Colab
* A GCS bucket for storing the input and output files

## Step by Step procedure

### 1.Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [12]:
import cv2
import numpy as np
import pymupdf as fitz
from google.cloud import storage
import io
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

### 2.Setup the inputs

In [None]:
# GCS bucket the source PDF is downloaded from (name only, no "gs://" prefix).
input_bucket_name = "<input-bucket>"
# GCS bucket the filtered PDF is uploaded to.
output_bucket_name = "<output-bucket>"
# Object path of the source PDF inside the input bucket.
document_path = "Table Identification Test/Test_Table_ID.pdf"
# Object-name prefix ("folder") inside the output bucket; the generated file
# name is appended to it with a "/" separator.
output_full_path = "Table Identification Test/output"

### 3. Define the required functions

In [None]:
def find_tables_in_page(
    image: np.ndarray,
    *,
    min_area: float = 1000,
    min_aspect_ratio: float = 0.5,
    max_aspect_ratio: float = 5,
    line_length: int = 40,
) -> list:
    """
    Finds tables in an image and returns their contours.

    Ruling lines are isolated with morphological opening, merged into blobs by
    dilation, and every outer blob whose size and shape pass the filters below
    is reported as a table candidate.

    Args:
        image (np.ndarray): The page image, either BGR colour (3 channels) or
            already single-channel grayscale.
        min_area (float, optional): Contour area (in pixels) a candidate must
            exceed. Defaults to 1000.
        min_aspect_ratio (float, optional): Smallest accepted width/height ratio
            of the candidate's bounding box. Defaults to 0.5.
        max_aspect_ratio (float, optional): Largest accepted width/height ratio
            of the candidate's bounding box. Defaults to 5.
        line_length (int, optional): Length in pixels of the 1-pixel-thick
            structuring elements used to pick out horizontal and vertical
            lines. Defaults to 40.

    Returns:
        list: A list of contours that potentially represent tables in the image.
    """
    # A 2-D array is already grayscale; cvtColor(BGR2GRAY) would reject it.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Inverted threshold: dark strokes become the white foreground that the
    # morphological operations below act on.
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 11, 4
    )

    # Opening with a long, 1-pixel-thick kernel removes text and keeps only
    # long straight runs: first horizontal ones, then vertical ones.
    line_masks = []
    for kernel_size in ((line_length, 1), (1, line_length)):
        line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        line_masks.append(
            cv2.morphologyEx(thresh, cv2.MORPH_OPEN, line_kernel, iterations=2)
        )
    combined = cv2.bitwise_or(line_masks[0], line_masks[1])

    # Dilate so nearby line segments fuse into one connected blob per grid.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    combined = cv2.dilate(combined, closing_kernel, iterations=3)

    # Only outer contours matter; the hierarchy output is not needed.
    contours, _ = cv2.findContours(
        combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    table_contours = []
    for contour in contours:
        area = cv2.contourArea(contour)
        _, _, w, h = cv2.boundingRect(contour)
        aspect_ratio = float(w) / h if h != 0 else float("inf")

        # Reject small specks and extremely elongated shapes (e.g. a single
        # rule or underline) that are unlikely to be a table grid.
        if area > min_area and min_aspect_ratio <= aspect_ratio <= max_aspect_ratio:
            table_contours.append(contour)

    return table_contours


def identify_table_pages(doc: fitz.Document, dpi: int = 300) -> list:
    """
    Identifies pages in a PDF document that contain tables.

    Each page is rendered to a raster image, decoded into an OpenCV array and
    passed to ``find_tables_in_page``; a page is reported when at least one
    table candidate is returned. A message is printed for every hit.

    Args:
        doc (fitz.Document): The PDF document to analyze.
        dpi (int, optional): Dots per inch for the resolution of the images
            rendered from the PDF. Defaults to 300.

    Returns:
        list: Zero-based indices of the pages where tables are found, in
        document order.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    # It does not depend on the page, so the matrix is built once.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    table_pages = []
    for page_index, page in enumerate(doc):
        pix = page.get_pixmap(matrix=render_matrix)
        # NOTE(review): JPEG is lossy; compression artefacts may influence the
        # thresholding step downstream. Kept to preserve existing results.
        image = read_image_from_bytecode(pix.tobytes("jpeg"))
        if find_tables_in_page(image):
            # Report 1-based page numbers to the user, store 0-based indices.
            print(f"Table found on page {page_index + 1}")
            table_pages.append(page_index)
    return table_pages


def read_image_from_bytecode(img_bytes: bytes) -> np.ndarray:
    """
    Decodes an encoded image held in memory into an OpenCV array.

    Args:
        img_bytes (bytes): The encoded image file content (e.g. JPEG or PNG
            bytes).

    Returns:
        np.ndarray: The decoded colour image in BGR channel order, ready for
        OpenCV processing.
    """
    # View the raw bytes as a flat uint8 buffer, which is what imdecode expects.
    encoded_buffer = np.frombuffer(img_bytes, np.uint8)
    decoded_image = cv2.imdecode(encoded_buffer, cv2.IMREAD_COLOR)
    return decoded_image


def document_downloader(bucket_name: str, blob_name_with_prefix_path: str) -> bytes:
    """
    Fetches an object from Google Cloud Storage and returns its content.

    Args:
        bucket_name (str): Name of the Cloud Storage bucket holding the object.
        blob_name_with_prefix_path (str): Full object name inside the bucket,
            including any prefix ("folder") path.

    Returns:
        bytes: The raw content of the downloaded object.
    """
    client = storage.Client()
    source_blob = client.bucket(bucket_name).blob(blob_name_with_prefix_path)
    return source_blob.download_as_bytes()


def save_pdf_to_bucket(
    pdf_bytes: bytes, bucket_name: str, destination_blob_name: str
) -> None:
    """
    Uploads PDF content to a Google Cloud Storage bucket.

    Args:
        pdf_bytes (bytes): The serialized PDF to upload.
        bucket_name (str): Name of the target Cloud Storage bucket.
        destination_blob_name (str): Full object name (prefix path plus file
            name) to write inside the bucket.

    Returns:
        None
    """
    client = storage.Client()
    target_blob = client.bucket(bucket_name).blob(destination_blob_name)
    # Set the content type explicitly so the object is served as a PDF.
    target_blob.upload_from_string(pdf_bytes, content_type="application/pdf")

### 4.Run the code

In [None]:
from pathlib import PurePosixPath

# Build the output name from the input's stem so the result is
# "<name>_tables.pdf" rather than "<name>.pdf_tables.pdf".
file_name = document_path.split("/")[-1]
output_file_name = f"{PurePosixPath(file_name).stem}_tables.pdf"

# Download the source PDF and find the pages that contain tables.
document_bytes = document_downloader(input_bucket_name, document_path)
pdf_document = fitz.open("pdf", document_bytes)
table_pages = identify_table_pages(pdf_document)

if not table_pages:
    # PyMuPDF refuses to serialize a document with zero pages, so there is
    # nothing to upload when no table page was detected.
    print(
        f"No tables found in gs://{input_bucket_name}/{document_path}; nothing was uploaded."
    )
else:
    # Create a new PDF document with only the pages that have tables
    output_pdf = fitz.open()
    for page_index in table_pages:
        output_pdf.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)

    # Serialize the new PDF in memory and save it to GCS
    destination_blob_name = f"{output_full_path}/{output_file_name}"
    save_pdf_to_bucket(
        output_pdf.tobytes(),
        output_bucket_name,
        destination_blob_name,
    )
    print(
        f"PDF with extracted tables saved to: gs://{output_bucket_name}/{destination_blob_name}"
    )

### 5.Output

This code extracts the pages that contain tables from the input document and saves them together as a single new PDF at the specified output location in the GCS bucket.
