# Language Detection of Document and Translation

* Author: docai-incubator@google.com


## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Purpose and Description

This document shows how to use the Cloud Translation API to detect the language of a document and to translate the document into a desired language.

## Prerequisites

1. Vertex AI Notebook
2. Documents in GCS Folder
3. Output folder to upload translated documents
4. Enable Translate API



## Functions to translate the documents and detect language

### 1. Detecting language from Doc AI parsed JSON (OCR text)

In [None]:
!pip install google-cloud-translate

In [1]:
# Run this cell to download utilities module
# !wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
from utilities import documentai_json_proto_downloader
from google.cloud import translate_v3beta1 as translate


def sample_detect_language(
    project_id: str, content: str, location: str = "us-central1"
) -> translate.DetectLanguageResponse:
    """Detect the language of a piece of text with the Translation API.

    Args:
        project_id: The GCP project ID.
        content: The text form of the document.
        location: Location used to build the request parent. Defaults to
            "us-central1", the value this sample previously hard-coded.

    Returns:
        The detect-language response returned by the client. The same
        response is also printed so the notebook shows it immediately.
    """
    # Create a client
    client = translate.TranslationServiceClient()

    # Initialize request argument(s)
    parent = client.common_location_path(project=project_id, location=location)
    request = translate.DetectLanguageRequest(content=content, parent=parent)

    # Make the request
    response = client.detect_language(request=request)

    # Handle the response: print it for interactive use, and return it so
    # callers can inspect the result programmatically.
    print(response)

    return response


# Example invocation (replace the placeholder project ID and GCS path)
project_id = "<PROJECT_ID>"
path = "gs://xxxx/xxxxxxxx/xxxxx.json"

# Splitting "gs://bucket/dir/file.json" on "/" yields
# ["gs:", "", "bucket", "dir", "file.json"]: the bucket is at index 2 and
# the object path is everything after it, re-joined with "/".
path_in_list = path.split("/")
bucket, prefix_file_path = path_in_list[2], "/".join(path_in_list[3:])

# Load the document through the utilities helper and run detection on its
# `.text` attribute.
json_dict = documentai_json_proto_downloader(bucket, prefix_file_path)
content = json_dict.text
sample_detect_language(project_id, content)

#### Output


languages {
  language_code: "de"
  confidence: 1.0
}


### 2. Detecting language and Translating the document (Single document)

In [None]:
from google.cloud import translate_v3beta1 as translate


def translate_document(
    project_id: str,
    gcs_input_path: str,
    gcs_output_path: str,
    target_language_code: str = "en",
    location: str = "us-central1",
) -> translate.TranslateDocumentResponse:
    """Translate a single document stored in GCS.

    No source language is set on the request; the detected language code
    reported in the response is printed.

    Args:
        project_id: The GCP project ID.
        gcs_input_path: GCS URI of the document to translate.
        gcs_output_path: GCS URI prefix under which the translated document
            is written.
        target_language_code: Language code to translate the document into.
            Defaults to "en", the value this sample previously hard-coded.
        location: Location used to build the request parent. Defaults to
            "us-central1", the value this sample previously hard-coded.

    Returns:
        The response of the translate-document call.
    """

    client = translate.TranslationServiceClient()

    parent = client.common_location_path(project=project_id, location=location)

    # Supported language codes: https://cloud.google.com/translate/docs/language
    gcs_source = translate.GcsSource(input_uri=gcs_input_path)
    document_input_config = translate.DocumentInputConfig(gcs_source=gcs_source)
    gcs_destination = translate.GcsDestination(output_uri_prefix=gcs_output_path)
    document_output_config = translate.DocumentOutputConfig(
        gcs_destination=gcs_destination
    )

    request = translate.TranslateDocumentRequest(
        parent=parent,
        target_language_code=target_language_code,
        document_input_config=document_input_config,
        document_output_config=document_output_config,
    )
    response = client.translate_document(request)
    print(
        f"Response: Detected Language Code - {response.document_translation.detected_language_code}"
    )

    return response


# Example invocation (replace the placeholder project ID and GCS paths)
project_id = "<PROJECT_ID>"
gcs_input_path = "gs://xxx/xxxxxx/xxxx.pdf"
gcs_output_path = "gs://xxxx/xxxxxx/"
response = translate_document(
    project_id=project_id,
    gcs_input_path=gcs_input_path,
    gcs_output_path=gcs_output_path,
)

#### Output

Response: Detected Language Code - fr

The translated document is also saved to the GCS output path; the output file name (format) follows the convention described in the [DocumentOutputConfig documentation](https://cloud.google.com/python/docs/reference/translate/latest/google.cloud.translate_v3beta1.types.DocumentOutputConfig).

### 3. Detecting language and Translating the documents (Batch process)

In [None]:
from typing import Sequence

from google.cloud import translate_v3beta1 as translate


def batch_translate_document(
    input_uri: str,
    output_uri: str,
    project_id: str,
    source_language_code: str,
    target_language_codes: Sequence[str] = ("en",),
    timeout: int = 600,
    location: str = "us-central1",
) -> translate.BatchTranslateDocumentResponse:
    """Batch translate documents.

    Starts a batch translation operation, waits for it to finish and
    prints the total page count reported in the response.

    Args:
        input_uri: Google Cloud Storage location of the input document.
        output_uri: Google Cloud Storage location of the output document.
        project_id: The GCP project ID.
        source_language_code: Language code of the input documents.
        target_language_codes: Language codes to translate into; more than
            one may be given. A single string is treated as one code.
            Defaults to ("en",).
        timeout: The timeout for this request, passed to the operation's
            ``result`` call.
        location: Location used to build the request parent. The ``global``
            location is not supported for batch translation. Defaults to
            "us-central1", the value this sample previously hard-coded.

    Returns:
        Translated document response
    """
    # A bare string would otherwise be expanded character by character when
    # converted to a list below.
    if isinstance(target_language_codes, str):
        target_language_codes = [target_language_codes]

    client = translate.TranslationServiceClient()

    parent = client.common_location_path(project=project_id, location=location)

    # Supported language codes: https://cloud.google.com/translate/docs/language
    gcs_source = translate.GcsSource(input_uri=input_uri)
    batch_document_input_configs = translate.BatchDocumentInputConfig(
        gcs_source=gcs_source
    )
    gcs_destination = translate.GcsDestination(output_uri_prefix=output_uri)
    batch_document_output_config = translate.BatchDocumentOutputConfig(
        gcs_destination=gcs_destination
    )

    request = translate.BatchTranslateDocumentRequest(
        parent=parent,
        source_language_code=source_language_code,
        target_language_codes=list(target_language_codes),
        input_configs=[batch_document_input_configs],
        output_config=batch_document_output_config,
    )

    operation = client.batch_translate_document(request=request)

    print("Waiting for operation to complete...")
    response = operation.result(timeout)

    print(f"Total Pages: {response.total_pages}")

    # Return the finished response, as the annotation and docstring promise,
    # rather than the operation handle.
    return response


# Example invocation (replace the placeholder project ID and GCS paths)
project_id = "<PROJECT_ID>"
input_path = "gs://xxxx/xxxxxxx/*"
output_path = "gs://xxxx/xxxxxxxx/xxxx/"

# One source language, two target languages, 600-second wait.
operation = batch_translate_document(
    project_id=project_id,
    input_uri=input_path,
    output_uri=output_path,
    source_language_code="fr-FR",
    target_language_codes=["en", "hi"],
    timeout=600,
)

The target language codes can include multiple languages. The translated documents are saved in the given output path, together with an `index.csv` file that lists the file paths and names of the source and translated documents.

<img src="./Images/language_detection_output_1.png" width=800 height=400 alt="Language detection bucket Output image">

<img src="./Images/language_detection_output_2.png" width=800 height=400 alt="Language detection CSV output image">

## Supported Languages and File types [Link](https://cloud.google.com/translate/docs/advanced/translate-documents)


<img src="./Images/language_detection_output_3.png" width=800 height=400 alt="Language detection supported language image">

## Supported Languages  [Link](https://cloud.google.com/translate/docs/advanced/discovering-supported-languages-v3)


### 1. Function to get all the supported languages in the project

In [None]:
from google.cloud import translate


def get_supported_languages(
    project_id: str,  # = "YOUR_PROJECT_ID",
) -> translate.SupportedLanguages:
    """Print and return the languages supported for a project.

    Args:
        project_id: The GCP project ID.

    Returns:
        The supported-languages response; the ``language_code`` of each
        entry in its ``languages`` field is printed as well.
    """
    translation_client = translate.TranslationServiceClient()

    # Supported language codes: https://cloud.google.com/translate/docs/languages
    supported = translation_client.get_supported_languages(
        parent=translation_client.common_project_path(project=project_id)
    )

    # One line per supported language, after a heading.
    print("Supported Languages:")
    for entry in supported.languages:
        print(f"Language Code: {entry.language_code}")

    return supported


# Example invocation (replace the placeholder with a real project ID)
project_id = "<PROJECT_ID>"
get_supported_languages(project_id=project_id)

### 2. Function to get all the supported languages with target language

In [4]:
from google.cloud import translate


def get_supported_languages_with_target(
    project_id: str,  # = "YOUR_PROJECT_ID"
    display_language_code: str = "fr",
) -> translate.SupportedLanguages:
    """Listing supported languages with target language name.

    Args:
        project_id: Your Google Cloud project ID.
        display_language_code: Target language code passed to the API as
            ``display_language_code``. Defaults to "fr", the value this
            sample previously hard-coded.

    Returns:
        Supported languages.
    """
    client = translate.TranslationServiceClient()
    parent = client.common_project_path(project=project_id)

    # Supported language codes: https://cloud.google.com/translate/docs/languages
    response = client.get_supported_languages(
        display_language_code=display_language_code,  # target language code
        parent=parent,
    )
    # List language codes and display names of supported languages
    for language in response.languages:
        print(f"Language Code: {language.language_code}")
        print(f"Display Name: {language.display_name}")

    return response


# Example invocation (replace the placeholder with a real project ID)
project_id = "<PROJECT_ID>"
get_supported_languages_with_target(project_id=project_id)