In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Batch Processing with Document AI Toolbox

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/document-ai-samples/blob/main/toolbox-batch-processing/documentai-toolbox-batch-entity-extraction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fdocument-ai-samples%2Fmain%2Ftoolbox-batch-processing%2Fdocumentai-toolbox-batch-entity-extraction.ipynb">
      <img width="32px" src="https://storage.googleapis.com/github-repo/colab_enterprise.svg" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/document-ai-samples/blob/main/toolbox-batch-processing/documentai-toolbox-batch-entity-extraction.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/toolbox-batch-processing/documentai-toolbox-batch-entity-extraction.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>


[Document AI Toolbox](https://cloud.google.com/document-ai/docs/toolbox) is an SDK for Python that provides utility
functions for managing, manipulating, and extracting information from the [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) object.

It creates a ["wrapped" document object](https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document) from a processed document response from JSON files in
Cloud Storage, local JSON files, or output directly from the [`process_document()`](https://cloud.google.com/document-ai/docs/reference/rest/v1/projects.locations.processors/process) method.

It can perform the following actions:

- Combine fragmented [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) JSON files from Batch Processing into a single ["wrapped" document](https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document).
  - Export shards as a unified [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document).

- Get [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) output from:
  - [Cloud Storage](https://cloud.google.com/storage)
  - [`BatchProcessMetadata`](https://cloud.google.com/document-ai/docs/reference/rest/Shared.Types/BatchProcessMetadata)
  - [`Operation` name](https://cloud.google.com/document-ai/docs/reference/rest/Shared.Types/ListOperationsResponse#Operation.FIELDS.name)

- Access text from [`Pages`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#page), [`Lines`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#line), [`Paragraphs`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#paragraph), [`FormFields`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#formfield), and [`Tables`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#table) without handling [`Layout`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#Layout) information.

- Search for [`Pages`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#page) containing a target string or matching a regular expression.

- Search for [`FormFields`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#formfield) by name.

- Search for [`Entities`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#entity) by type.

- Convert [`Tables`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#table) to a [Pandas](https://pandas.pydata.org/) Dataframe or CSV.

- Insert [`Entities`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#entity) and [`FormFields`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#formfield) into a [BigQuery](https://cloud.google.com/bigquery) table.

- Split a PDF file based on [output from a Splitter/Classifier processor](https://cloud.google.com/document-ai/docs/splitters).

- Extract image [`Entities`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#entity) from [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) [bounding boxes](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#boundingpoly).

- Convert [`Documents`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) to and from commonly used formats:
  - [Cloud Vision API](https://cloud.google.com/vision) [`AnnotateFileResponse`](https://cloud.google.com/vision/docs/reference/rest/v1/BatchAnnotateFilesResponse#AnnotateFileResponse)
  - [hOCR](https://en.wikipedia.org/wiki/HOCR)
  - Third-party document processing formats

- Create batches of documents for processing from a [Cloud Storage](https://cloud.google.com/storage) folder.


In [None]:
# Install or upgrade the Document AI client library, Document AI Toolbox, and pandas
# for the current kernel (`--user` targets the user site-packages; `-q` keeps pip quiet).
%pip install --upgrade  --user -q google-cloud-documentai google-cloud-documentai-toolbox pandas

**Colab only:** Run the following cell to restart the kernel, or use the restart button. For Vertex AI Workbench, you can restart the kernel using the button at the top.

In [None]:
# Restart the kernel after the installs so the environment can import the new packages.
import IPython

# `True` asks the kernel to restart after shutting down rather than stay stopped.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:
import sys

# Google Colab needs an additional sign-in step; in any other environment
# this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Authenticate the current user to Google Cloud.
    auth.authenticate_user()

In [None]:
# TODO(developer): Fill these variables before running the sample.
# Google Cloud project that contains the processor.
project_id = "YOUR_PROJECT_ID"  # @param {type:"string"}
# Processor location; also used to build the regional API endpoint.
# https://cloud.google.com/document-ai/docs/regions
location = "us"  # @param {type:"string"}

# Create processor before running sample
# https://cloud.google.com/document-ai/docs/create-processor
processor_id = "YOUR_PROCESSOR_ID"  # @param {type:"string"}
# Processor version to run (the default here is `stable`).
# https://cloud.google.com/document-ai/docs/manage-processor-versions
processor_version_id = "stable"  # @param {type:"string"}

# Cloud Storage location of the input files.
# Format: `gs://bucket/directory/`
gcs_input_uri = "YOUR_INPUT_BUCKET"  # @param {type:"string"}
# Cloud Storage location where the output is written.
# Must end with a trailing slash `/`. Format: `gs://bucket/directory/subdirectory/`
gcs_output_uri = "YOUR_OUTPUT_BUCKET"  # @param {type:"string"}

# Size of each batch of documents, passed to `create_batches()`.
batch_size = 1000
# Optional. The fields to return in the Document object.
field_mask = "text,entities,pages,shardInfo"  # @param {type:"string"}

In [None]:
# Set the project id (`{project_id}` is interpolated from the Python variable of that name)
!gcloud config set project {project_id}
# Log in to obtain Application Default Credentials; `-q` disables interactive prompts.
!gcloud auth application-default login -q

In [None]:
from IPython.display import display

from typing import List, Optional

# https://googleapis.dev/python/google-api-core/latest/client_options.html
from google.api_core.client_options import ClientOptions

# https://cloud.google.com/python/docs/reference/documentai/latest
from google.cloud import documentai

# https://cloud.google.com/document-ai/docs/toolbox
from google.cloud import documentai_toolbox

import pandas as pd

## Batch Processing

- Create batches of up to `batch_size` documents (1000 by default) from the input folder in Google Cloud Storage.
- Make a batch processing request for each batch.
- Get long-running operation ID for each request.

In [None]:
def batch_process_toolbox(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    batch_size: int,
    field_mask: Optional[str] = None,
    skip_human_review: bool = True,
) -> List:
    """Start one batch processing request per batch of files under `gcs_input_uri`.

    The input location is split into batches with Document AI Toolbox, and each
    batch is submitted to the given processor version. The function does not
    wait for the requests to finish.

    Args:
        project_id: Google Cloud project ID used to build the processor version path.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the processor.
        processor_version_id: ID of the processor version to run.
        gcs_input_uri: Cloud Storage URI holding the input files.
        gcs_output_uri: Cloud Storage URI for the output directory.
        batch_size: Batch size handed to `create_batches()`.
        field_mask: Optional field mask set on the Cloud Storage output config.
        skip_human_review: Forwarded to each `BatchProcessRequest`.

    Returns:
        The long-running operations returned by `batch_process_documents()`,
        one per batch, in batch order.
    """
    # Requests go to the endpoint for the processor's location.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor version, e.g.:
    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    processor_version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    # Every batch writes its results to the same Cloud Storage output directory.
    gcs_destination = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_destination)

    # Split the input location into batches of documents.
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.utilities.gcs_utilities
    bucket_name, prefix = documentai_toolbox.gcs_utilities.split_gcs_uri(gcs_input_uri)
    input_batches = documentai_toolbox.gcs_utilities.create_batches(
        bucket_name, prefix, batch_size=batch_size
    )
    print(f"{len(input_batches)} batches created.")

    started_operations = []
    for input_batch in input_batches:
        batch_files = input_batch.gcs_documents.documents
        print(f"{len(batch_files)} files in batch.")
        print(batch_files)

        # https://cloud.google.com/document-ai/docs/send-request?hl=en#async-processor
        request = documentai.BatchProcessRequest(
            name=processor_version_name,
            input_documents=input_batch,
            document_output_config=output_config,
            skip_human_review=skip_human_review,
        )
        # `batch_process_documents()` returns a Long Running Operation (LRO)
        started_operations.append(client.batch_process_documents(request=request))

    return started_operations

## Retrieve results once processing is complete

- Get output [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) JSON files written under `gcs_output_uri`, located via each operation's metadata.

In [None]:
# Submit one batch processing request per batch of input files.
operations = batch_process_toolbox(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    processor_version_id=processor_version_id,
    gcs_input_uri=gcs_input_uri,
    gcs_output_uri=gcs_output_uri,
    batch_size=batch_size,
    field_mask=field_mask,
)

# Timeout, in seconds, passed to each `operation.result()` call.
TIMEOUT = 60

# Wrapped documents gathered from all operations.
# Can do this asynchronously to avoid blocking
documents: List[documentai_toolbox.document.Document] = []

for operation in operations:
    # https://cloud.google.com/document-ai/docs/long-running-operations
    print(f"Waiting for operation {operation.operation.name}")
    operation.result(timeout=TIMEOUT)

    # Build the wrapped documents from the operation's metadata.
    batch_metadata = documentai.BatchProcessMetadata(operation.metadata)
    batch_documents = documentai_toolbox.document.Document.from_batch_process_metadata(
        batch_metadata
    )
    documents.extend(batch_documents)

## Print results

- Export extracted entities as dictionary
- Load into Pandas DataFrame
- Print DataFrame

In [None]:
# Show one table per wrapped document.
for document in documents:
    # Export the document's extracted entities as a dictionary.
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document#google_cloud_documentai_toolbox_wrappers_document_Document_entities_to_dict
    entities = document.entities_to_dict()
    # Optional: Export to BQ
    # job = document.entities_to_bigquery(dataset_name, table_name, project_id=project_id)

    # Wrapping the dictionary in a list yields a single-row DataFrame.
    df = pd.DataFrame([entities])

    display(df)