# Character Box Removal

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This document is a guide to pre-processing PDFs whose values are written inside character boxes, so that the character boxes are removed completely. Removing them helps mitigate the OCR confusion caused by the box lines.

## Prerequisites
* Vertex AI Notebook.
* Permissions for Google DocAI Processors, Cloud Storage and Vertex AI Notebook.

## Step by Step Procedure

### 1. Import Modules/Packages

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install opencv-python matplotlib numpy pillow
!pip install pdf2image

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
import os
from google.cloud import storage
import os
from typing import List, Union

### 2. Input Details

* **local_folder_path**: Local directory containing input PDFs
* **local_output_folder_path**: Local directory where output PDFs are stored
* **bucket_name**: Bucket Name of GCS Storage
* **gcs_folder_path**: Path to GCS directory where processed files will be stored

In [None]:
# Local directory that holds the input PDFs
local_folder_path = "<input-path>"
# Local directory where the processed PDFs are written
local_output_folder_path = "<output-path>"
# Name of the GCS bucket (replace with your own bucket name)
bucket_name = "<bucket-name>"
# GCS directory where the processed files will be stored
gcs_folder_path = "<output-directory>"

### 3. Run the required functions

In [None]:
def _show_box_removal_stages(
    original: np.ndarray, line_mask: np.ndarray, cleaned: np.ndarray
) -> None:
    """
    Plots the upscaled input, the detected line mask and the cleaned result side by side.

    Args:
        original (np.ndarray): Upscaled input image in BGR channel order.
        line_mask (np.ndarray): Single-channel mask of the detected box lines.
        cleaned (np.ndarray): Final processed image in BGR channel order.
    """
    fig = plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.imshow(cv2.cvtColor(original, cv2.COLOR_BGR2RGB))
    plt.title("Original Image")

    plt.subplot(1, 3, 2)
    plt.imshow(line_mask, cmap="gray")
    plt.title("Detected Lines")

    plt.subplot(1, 3, 3)
    plt.imshow(cv2.cvtColor(cleaned, cv2.COLOR_BGR2RGB))
    plt.title("Result Image")

    plt.tight_layout()
    plt.show()
    # Release the figure so that figures do not accumulate when many pages
    # are processed in one run.
    plt.close(fig)


def box_fun(image_path: str) -> np.ndarray:
    """
    Processes an image to detect and remove lines, enhance text quality, and improve resolution.

    The image is upscaled 2x, long horizontal/vertical strokes are detected with
    morphological opening, the detected strokes are inpainted away, and the
    result is lightly blurred and sharpened. The three stages are also plotted
    with matplotlib.

    Args:
        image_path (str): Path to the input image.

    Returns:
        np.ndarray: The processed image as a NumPy array, in OpenCV's BGR
            channel order and at twice the input resolution.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    # Read the image; cv2.imread signals failure by returning None instead of
    # raising, so check explicitly to give a clear error message.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(
            f"Unable to read image '{image_path}' (missing file or unsupported format)"
        )

    # Increase the resolution by resizing the image
    image = cv2.resize(image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply adaptive thresholding (inverted: dark ink becomes white foreground)
    thresh = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        21,  # Adjusted block size
        5,  # Adjusted constant
    )

    # Create kernels for line detection: long, 1-pixel-thick rectangles so that
    # only long straight strokes survive the opening.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (50, 1)
    )  # Adjusted size
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, 45)
    )  # Adjusted size

    # Detect horizontal and vertical lines
    horizontal_lines = cv2.morphologyEx(
        thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
    )
    vertical_lines = cv2.morphologyEx(
        thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2
    )

    # Combine horizontal and vertical lines. The weights only change the grey
    # level shown in the preview; every non-zero pixel is used as mask below.
    lines = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.6, 0.0)

    # Dilate the lines to make them more prominent
    lines = cv2.dilate(
        lines, cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)), iterations=2
    )  # Adjusted dilation

    # Remove the lines from the original image (inpaint returns a new array,
    # so `image` stays untouched for the preview).
    result = cv2.inpaint(image, lines, 5, cv2.INPAINT_TELEA)  # Adjusted radius

    # Optional: Apply Gaussian blur for smoothing
    result = cv2.GaussianBlur(result, (3, 3), 0)

    # Enhance text quality with sharpening. The kernel sums to 2, so flat
    # regions are brightened as well as edges being sharpened.
    sharpen_kernel = np.array(
        [[-1, -1, -1], [-1, 10, -1], [-1, -1, -1]]
    )  # Adjusted kernel
    result = cv2.filter2D(result, -1, sharpen_kernel)  # Apply sharpening once

    # Display the images using matplotlib
    _show_box_removal_stages(image, lines, result)

    return result


def process_pdf(pdf_path: str) -> List[Union[Image.Image, None]]:
    """
    Processes each page of a PDF to remove lines, enhance quality, and save the results as a new PDF.

    Every page is rendered to a temporary PNG, passed through ``box_fun`` and
    collected as a PIL image. The processed pages are written to
    ``{local_output_folder_path}/improved_<name>.pdf``, where ``<name>`` is the
    input file name without its extension.

    Args:
        pdf_path (str): Path to the input PDF file.

    Returns:
        List[Union[Image.Image, None]]: List of processed images for each page.
            Empty (and no file is written) when the PDF yields no pages.
    """
    # Local import: tempfile is not part of the notebook's import cell.
    import tempfile

    print(pdf_path)
    pages = convert_from_path(pdf_path)
    if not pages:
        print(f"No pages found in {pdf_path}")
        return []

    processed_images = []

    # A private temporary directory avoids clobbering files in the working
    # directory and is cleaned up even if processing a page fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        for i, page in enumerate(pages):
            # Save each page as a temporary PNG
            temp_image_path = os.path.join(temp_dir, f"temp_page_{i}.png")
            page.save(temp_image_path, "PNG")

            # Process the image
            processed_image = box_fun(temp_image_path)

            if isinstance(processed_image, np.ndarray):
                processed_image = processed_image.astype("uint8")
                # box_fun works in OpenCV's BGR order while PIL interprets a
                # 3-channel array as RGB; convert so colours are not swapped.
                if processed_image.ndim == 3 and processed_image.shape[2] == 3:
                    processed_image = cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB)
                processed_image = Image.fromarray(processed_image)

            processed_images.append(processed_image)

    # Strip the directory and the extension regardless of the extension's case.
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(local_output_folder_path, exist_ok=True)
    output_pdf_path = f"{local_output_folder_path}/improved_{filename}.pdf"
    print(output_pdf_path)

    processed_images[0].save(
        output_pdf_path, save_all=True, append_images=processed_images[1:]
    )

    return processed_images


def upload_files_to_gcs(
    local_folder_path: str, bucket_name: str, gcs_folder_path: str
) -> None:
    """
    Uploads all files from a local folder to a specified Google Cloud Storage (GCS) folder.

    Only regular files directly inside ``local_folder_path`` are uploaded;
    sub-directories are skipped. Each file keeps its name and is placed under
    ``gcs_folder_path`` in the bucket.

    Args:
        local_folder_path (str): The local folder path containing files to upload.
        bucket_name (str): The name of the GCS bucket.
        gcs_folder_path (str): The destination folder path in GCS. Leading and
            trailing slashes are ignored; an empty value uploads to the bucket root.
    """
    # Initialize the GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # List all files in the local folder (sorted for a predictable upload order)
    files = sorted(
        f
        for f in os.listdir(local_folder_path)
        if os.path.isfile(os.path.join(local_folder_path, f))
    )

    # GCS object names always use "/" as the separator, independent of the
    # local OS, and a leading "/" would create an empty-named top-level folder.
    gcs_prefix = gcs_folder_path.strip("/")

    print("Uploading files to GCS Bucket")
    for file_name in files:
        # Full path to the local file
        local_file_path = os.path.join(local_folder_path, file_name)

        # GCS path (include the file name in the destination)
        gcs_blob_path = f"{gcs_prefix}/{file_name}" if gcs_prefix else file_name

        # Create a blob in the bucket and upload the file
        blob = bucket.blob(gcs_blob_path)
        blob.upload_from_filename(local_file_path)

        print(f"Uploaded {file_name} to gs://{bucket_name}/{gcs_blob_path}")

### 4. Run the code

In [None]:
if __name__ == "__main__":
    # For Local Input Files
    files = [
        f"{local_folder_path}/{name}" for name in sorted(os.listdir(local_folder_path))
    ]

    for file in files:
        # Match the extension case-insensitively (".pdf", ".PDF", ".Pdf", ...)
        # and skip directories that merely happen to be named like a PDF.
        if os.path.isfile(file) and file.lower().endswith(".pdf"):
            process_pdf(file)

    # Push everything written to the local output folder to the GCS bucket
    upload_files_to_gcs(local_output_folder_path, bucket_name, gcs_folder_path)

### Output Details

### Before Tooling
<img src='./images/before_tooling.png' width=400 height=600 alt="Sample document before character box removal"></img>
### After Tooling
<img src='./images/after_tooling.png' width=400 height=600 alt="Sample document after character box removal"></img>