# Image Segmentation

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

## Objective

This document provides a step-by-step guide to processing a PDF file that contains multiple images separated by white space: it extracts the individual images and saves each one as a separate page in a new PDF.

## Prerequisites
* Python : Jupyter Notebook (Vertex).
* Storage Bucket.

## Step by Step Procedure

### 1. Import Modules/Packages

In [None]:
# Run this cell to download utilities module
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-cloud-storage
!pip install opencv-python-headless fpdf pdf2image

In [None]:
import os
import tempfile

import cv2
import numpy as np
from fpdf import FPDF
from google.cloud import storage
from pdf2image import convert_from_path

from utilities import file_names

### 2. Input Details

* **input_file_path**: Provide the GCS path of the folder that contains the input PDF files, ending with `/` (for example `gs://my-bucket/input_pdfs/`).
* **output_file_path**: Provide the GCS path of the folder where the output PDF files have to be saved, ending with `/` (for example `gs://my-bucket/output_pdfs/`).

In [None]:
# GCS folder whose files are listed and processed one by one in the final cell.
input_file_path = "gs://<<bucket_name>>/<<input_pdf_images>>/"
# GCS folder that receives one output PDF per input file (same file name).
# Keep the trailing "/": the output object name is built by appending the
# input file name directly to the path that follows the bucket name.
output_file_path = "gs://<<bucket_name>>/<<output_pdf_images>>/"

### 3. Run the required functions

In [None]:
def download_pdf_from_gcs(
    bucket_name: str, source_blob_name: str, destination_file_name: str
) -> None:
    """
    Fetch one object from a Google Cloud Storage (GCS) bucket and write it
    to a local file, then print a confirmation message.

    Parameters:
    bucket_name (str): Name of the GCS bucket that holds the object.
    source_blob_name (str): Name of the blob (file) inside the bucket.
    destination_file_name (str): Local path the blob content is written to.

    Returns:
    None
    """
    # Resolve client -> bucket -> blob in one chain; nothing is fetched
    # until download_to_filename() is called.
    source_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)

    print(f"PDF file {source_blob_name} downloaded to {destination_file_name}.")


def upload_pdf_to_gcs(
    bucket_name: str, source_file_name: str, destination_blob_name: str
) -> None:
    """
    Send a local file to a GCS bucket under the given blob name, then print
    a confirmation message.

    Parameters:
    bucket_name (str): Name of the GCS bucket that receives the file.
    source_file_name (str): Path of the local file to upload.
    destination_blob_name (str): Name of the blob (file) to create in the bucket.

    Returns:
    None
    """
    # Resolve client -> bucket -> blob in one chain; the transfer happens
    # in upload_from_filename().
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"PDF file {source_file_name} uploaded to {destination_blob_name}.")


def split_pdf_images_into_pages(
    input_pdf_path: str,
    output_pdf_path: str,
    dpi: int = 300,
    white_threshold: int = 240,
    min_width: int = 100,
    min_height: int = 100,
) -> None:
    """
    Split images in a PDF into separate pages and save the output as a new PDF.

    Every page of the input PDF is rasterised, regions of non-white content
    are located, and each region larger than the minimum size is cropped and
    placed on its own page of the output PDF. Regions are emitted per input
    page, ordered from top to bottom.

    Parameters:
    input_pdf_path (str): Path to the input PDF file.
    output_pdf_path (str): Path to save the output PDF file.
    dpi (int): Resolution used to rasterise the input pages.
    white_threshold (int): Gray level (0-255); pixels brighter than this are
        treated as white background separating the images.
    min_width (int): Regions must be wider than this many pixels to be kept.
    min_height (int): Regions must be taller than this many pixels to be kept.

    Returns:
    None
    """
    # FPDF() defaults to an A4 portrait page measured in mm (210 x 297).
    # A 10 mm margin on every side leaves a 190 x 277 mm drawable area.
    margin_mm = 10
    max_width_mm, max_height_mm = 190, 277

    pages = convert_from_path(input_pdf_path, dpi=dpi)
    pdf = FPDF()

    # The crops are only needed until the PDF is written, so keep them in a
    # temporary directory that is removed automatically (also on errors)
    # instead of leaving a "temp_images" folder behind in the working directory.
    with tempfile.TemporaryDirectory(prefix="temp_images_") as temp_folder:
        for page_number, page_image in enumerate(pages):
            page_array = np.array(page_image)
            color_image = cv2.cvtColor(page_array, cv2.COLOR_RGB2BGR)

            # Build a mask in which content is non-zero and background is zero:
            # threshold marks near-white pixels, the inversion flips it so that
            # findContours traces the content regions.
            gray_image = cv2.cvtColor(color_image, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(
                gray_image, white_threshold, 255, cv2.THRESH_BINARY
            )
            inverted_binary = cv2.bitwise_not(binary)

            # RETR_EXTERNAL keeps only outermost contours, so shapes nested
            # inside an image do not produce additional crops.
            contours, _ = cv2.findContours(
                inverted_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            # Order regions by their top edge (y) so output pages follow the
            # top-to-bottom layout of the input page.
            bounding_boxes = [cv2.boundingRect(c) for c in contours]
            sorted_bounding_boxes = sorted(bounding_boxes, key=lambda box: box[1])

            for i, (x, y, w, h) in enumerate(sorted_bounding_boxes):
                # Skip regions that do not exceed the minimum size in both
                # dimensions.
                if w <= min_width or h <= min_height:
                    continue

                cropped_image = color_image[y : y + h, x : x + w]
                temp_image_path = os.path.join(
                    temp_folder, f"temp_image_{page_number}_{i}.png"
                )
                cv2.imwrite(temp_image_path, cropped_image)

                pdf.add_page()
                if h * max_width_mm > w * max_height_mm:
                    # Too tall to fit at full width: constrain the height and
                    # let FPDF derive the width, so the bottom is not cut off.
                    pdf.image(
                        temp_image_path, x=margin_mm, y=margin_mm, h=max_height_mm
                    )
                else:
                    pdf.image(
                        temp_image_path, x=margin_mm, y=margin_mm, w=max_width_mm
                    )

        # Write the PDF while the crops still exist on disk.
        pdf.output(output_pdf_path)

    print(f"Output PDF created: {output_pdf_path}")


def process_pdf_in_gcs(
    input_bucket_name: str,
    input_blob_name: str,
    output_bucket_name: str,
    output_blob_name: str,
) -> None:
    """
    Process a PDF stored in a GCS bucket by splitting its images into pages
    and saving the output back to a GCS bucket.

    The input is downloaded to a temporary directory, split locally, and the
    result is uploaded. The temporary directory is removed afterwards, also
    when one of the steps raises.

    Parameters:
    input_bucket_name (str): Name of the input GCS bucket.
    input_blob_name (str): Name of the input blob (file) in the GCS bucket.
    output_bucket_name (str): Name of the output GCS bucket.
    output_blob_name (str): Name of the output blob (file) in the GCS bucket.

    Returns:
    None
    """
    # Work inside a private temporary directory rather than the current
    # working directory: this avoids overwriting (and then deleting) any
    # existing "input.pdf" / "output.pdf" of the user, and guarantees cleanup
    # even if download, splitting or upload fails.
    with tempfile.TemporaryDirectory() as work_dir:
        local_input_pdf = os.path.join(work_dir, "input.pdf")
        local_output_pdf = os.path.join(work_dir, "output.pdf")

        download_pdf_from_gcs(input_bucket_name, input_blob_name, local_input_pdf)
        split_pdf_images_into_pages(local_input_pdf, local_output_pdf)
        upload_pdf_to_gcs(output_bucket_name, local_output_pdf, output_blob_name)

### 4. Run the code

In [None]:
if __name__ == "__main__":
    # "gs://<bucket>/<prefix>/..." split on "/" -> ["gs:", "", "<bucket>", ...]
    input_bucket = input_file_path.split("/")[2]
    output_bucket = output_file_path.split("/")[2]

    # Object-name prefix inside the output bucket. Make sure a non-empty
    # prefix ends with "/" so that "gs://bucket/out" and "gs://bucket/out/"
    # both produce "out/<file>" instead of fusing folder and file name.
    output_prefix = "/".join(output_file_path.split("/")[3:])
    if output_prefix and not output_prefix.endswith("/"):
        output_prefix += "/"

    # NOTE(review): file_names(...)[1] is used as a mapping whose values are
    # full blob names under input_file_path - confirm against utilities.py.
    input_blob_files = file_names(input_file_path)[1].values()

    for input_blob in input_blob_files:
        # Keep the input's base file name for the output object.
        output_blob = output_prefix + input_blob.split("/")[-1]
        process_pdf_in_gcs(input_bucket, input_blob, output_bucket, output_blob)
    print("Splitting for all the files are done.")

### Output Details

### Before Splitting
<img src='./images/before_splitting.png' width=400 height=600 alt="Sample Output"></img>
### After Splitting
<img src='./images/after_splitting.png' width=400 height=600 alt="Sample Output"></img>