incubator-tools/language_detection/language_detection.ipynb (540 lines of code) (raw):

{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "72fd064f-24f5-4d61-b0ad-2b2f3fe9427d", "metadata": {}, "source": [ "# Language Detection of Document and Translation" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f5756f1a-631f-4c8a-bba0-98c6821d31a9", "metadata": {}, "source": [ "* Author: docai-incubator@google.com\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "9b1d12ef-55dd-4fbd-8389-db14ed038eb1", "metadata": {}, "source": [ "## Disclaimer\n", "\n", "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied." ] }, { "attachments": {}, "cell_type": "markdown", "id": "94527514-1ae2-470b-96e2-0f48e4aa5e81", "metadata": {}, "source": [ "## Purpose and Description" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cdf4462a-fc0d-477f-9c39-4fec383ba4ca", "metadata": {}, "source": [ "This document guides you to use the translation API for detecting the language of the document and also to translate the document to the desired language." ] }, { "attachments": {}, "cell_type": "markdown", "id": "a8783f52-627b-4efa-b5d9-664ae2ca2564", "metadata": {}, "source": [ "## Prerequisites\n", "\n", "1. Vertex AI Notebook\n", "2. Documents in GCS Folder\n", "3. Output folder to upload translated documents\n", "4. Enable Translate API\n", "\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "55cc5540-deb1-4449-8278-716488c54e5c", "metadata": {}, "source": [ "## Functions to translate the documents and detect language" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d49f5b2d-f7fd-4403-a175-b95cc804f6ba", "metadata": {}, "source": [ "### 1. Detecting language from Doc AI parsed json( OCR text)" ] }, { "cell_type": "code", "execution_count": null, "id": "48983b9a", "metadata": {}, "outputs": [], "source": [ "!pip install google-cloud-translate" ] }, { "cell_type": "code", "execution_count": 1, "id": "62ef3a7a-e8ea-4045-8f54-ffe5dd4c4d16", "metadata": {}, "outputs": [], "source": [ "# Run this cell to download utilities module\n", "# !wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py" ] }, { "cell_type": "code", "execution_count": null, "id": "89120af5-c5f4-4897-a640-4ad9c5ce4739", "metadata": {}, "outputs": [], "source": [ "from utilities import documentai_json_proto_downloader\n", "from google.cloud import translate_v3beta1 as translate\n", "\n", "\n", "def sample_detect_language(project_id: str, content: str):\n", " \"\"\"This Function is used to detect the language using text\n", " Args:\n", " project_id: The GCP project ID.\n", " content: The text form of the document.\n", "\n", " Returns:\n", " The language of the document with the confidence score.\n", " \"\"\"\n", "\n", " location = \"us-central1\"\n", " # Create a client\n", " client = translate.TranslationServiceClient()\n", "\n", " # Initialize request argument(s)\n", " parent = client.common_location_path(project=project_id, location=location)\n", "\n", " request = translate.DetectLanguageRequest(\n", " content=content,\n", " parent=parent,\n", " )\n", " # Make the request\n", " response = client.detect_language(request=request)\n", "\n", " # Handle the response\n", " print(response)\n", "\n", "\n", "# Calling the functions#\n", "project_id = \"<PROJECT_ID>\"\n", "path = \"gs://xxxx/xxxxxxxx/xxxxx.json\"\n", "\n", "path_in_list = path.split(\"/\")\n", "bucket = path_in_list[2]\n", "prefix_file_path = \"/\".join(path_in_list[3:])\n", "\n", "json_dict = documentai_json_proto_downloader(bucket, prefix_file_path)\n", "content = json_dict.text\n", "sample_detect_language(project_id, content)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b02a0f30-9f93-46bf-b871-f34b4df08894", "metadata": {}, "source": [ "#### Output" ] }, { "attachments": {}, "cell_type": "markdown", "id": "678ebb32-9b15-4a98-a982-55eb7137a1f9", "metadata": {}, "source": [ "\n", "languages {\n", " language_code: \"de\"\n", " confidence: 1.0\n", "}\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "a6fa4328-3d99-4de4-a5eb-0f2033d78b79", "metadata": {}, "source": [ "### 2. Detecting language and Translating the document (Single document)" ] }, { "cell_type": "code", "execution_count": null, "id": "c7cf3f2d-f882-4461-b52b-9df0ad0e340d", "metadata": {}, "outputs": [], "source": [ "from google.cloud import translate_v3beta1 as translate\n", "\n", "\n", "def translate_document(\n", " project_id: str, gcs_input_path: str, gcs_output_path: str\n", ") -> translate.TranslationServiceClient:\n", " \"\"\"Translates a document.\n", "\n", " Args:\n", " project_id: The GCP project ID.\n", " file_path: The path to the file to be translated.\n", "\n", " Returns:\n", " The translated document.\n", " \"\"\"\n", "\n", " client = translate.TranslationServiceClient()\n", " location = \"us-central1\"\n", "\n", " parent = client.common_location_path(project=project_id, location=location)\n", "\n", " # Supported language codes: https://cloud.google.com/translate/docs/language\n", " gcs_source = translate.GcsSource(input_uri=gcs_input_path)\n", " document_input_config = translate.DocumentInputConfig(gcs_source=gcs_source)\n", " gcs_destination = translate.GcsDestination(output_uri_prefix=gcs_output_path)\n", " document_output_config = translate.DocumentOutputConfig(\n", " gcs_destination=gcs_destination\n", " )\n", "\n", " request = translate.TranslateDocumentRequest(\n", " parent=parent,\n", " target_language_code=\"en\",\n", " document_input_config=document_input_config,\n", " document_output_config=document_output_config,\n", " )\n", " response = client.translate_document(request)\n", " print(\n", " f\"Response: Detected Language Code - {response.document_translation.detected_language_code}\"\n", " )\n", "\n", " return response\n", "\n", "\n", "# Calling Function#\n", "project_id = \"<PROJECT_ID>\"\n", "gcs_input_path = \"gs://xxx/xxxxxx/xxxx.pdf\"\n", "gcs_output_path = \"gs://xxxx/xxxxxx/\"\n", "response = translate_document(project_id, gcs_input_path, gcs_output_path)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "7dca17b1-66ad-451f-84d6-93724b395b64", "metadata": {}, "source": [ "#### Output" ] }, { "attachments": {}, "cell_type": "markdown", "id": "5a58c073-6aa2-4c19-bf93-d3355a009ffd", "metadata": {}, "source": [ "Response: Detected Language Code - fr" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d5c5ca69-0c1e-4794-90d3-ea2212b4bc61", "metadata": {}, "source": [ "Translated document will also be saved in the gcs output path and file name (format) saved will be as per [doc](https://cloud.google.com/python/docs/reference/translate/latest/google.cloud.translate_v3beta1.types.DocumentOutputConfig)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "8b2c2216-4282-4433-9a74-cd503d874dad", "metadata": {}, "source": [ "### 3. Detecting language and Translating the documents ( Batch process)" ] }, { "cell_type": "code", "execution_count": null, "id": "55b31e47-492e-43b6-98e4-deb556cca31f", "metadata": {}, "outputs": [], "source": [ "from google.cloud import translate_v3beta1 as translate\n", "\n", "\n", "def batch_translate_document(\n", " input_uri: str,\n", " output_uri: str,\n", " project_id: str,\n", " source_language_code: str,\n", " target_language_codes: [\"en\"],\n", " timeout: int = 600,\n", ") -> translate.BatchTranslateDocumentResponse:\n", " \"\"\"Batch translate documents.\n", "\n", " Args:\n", " input_uri: Google Cloud Storage location of the input document.\n", " output_uri: Google Cloud Storage location of the output document.\n", " project_id: The GCP project ID.\n", " timeout: The timeout for this request.\n", "\n", " Returns:\n", " Translated document response\n", " \"\"\"\n", " client = translate.TranslationServiceClient()\n", "\n", " # The ``global`` location is not supported for batch translation\n", " location = \"us-central1\"\n", "\n", " # Google Cloud Storage location for the source input. This can be a single file\n", "\n", " parent = client.common_location_path(project=project_id, location=location)\n", "\n", " # Supported language codes: https://cloud.google.com/translate/docs/language\n", " gcs_source = translate.GcsSource(input_uri=input_uri)\n", " batch_document_input_configs = translate.BatchDocumentInputConfig(\n", " gcs_source=gcs_source\n", " )\n", " gcs_destination = translate.GcsDestination(output_uri_prefix=output_uri)\n", " batch_document_output_config = translate.BatchDocumentOutputConfig(\n", " gcs_destination=gcs_destination\n", " )\n", "\n", " request = translate.BatchTranslateDocumentRequest(\n", " parent=parent,\n", " source_language_code=source_language_code,\n", " target_language_codes=target_language_codes,\n", " input_configs=[batch_document_input_configs],\n", " output_config=batch_document_output_config,\n", " )\n", "\n", " operation = client.batch_translate_document(request=request)\n", "\n", " print(\"Waiting for operation to complete...\")\n", " response = operation.result(timeout)\n", "\n", " print(f\"Total Pages: {response.total_pages}\")\n", "\n", " return operation\n", "\n", "\n", "# example calling function#\n", "project_id = \"<PROJECT_ID>\"\n", "input_path = \"gs://xxxx/xxxxxxx/*\"\n", "output_path = \"gs://xxxx/xxxxxxxx/xxxx/\"\n", "operation = batch_translate_document(\n", " input_uri=input_path,\n", " output_uri=output_path,\n", " project_id=project_id,\n", " source_language_code=\"fr-FR\",\n", " target_language_codes=[\"en\", \"hi\"],\n", " timeout=600,\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e554bd6d-c11d-4c7e-a45b-569454c0af40", "metadata": {}, "source": [ "The target language code can be multiple languages and the output translated documents are saved in the output path given and also with a ‘index.csv’ file which has file path and names from source and destination as well ." ] }, { "attachments": {}, "cell_type": "markdown", "id": "650e4974-1542-4d21-b18b-a3ffe1d291c4", "metadata": {}, "source": [ "<img src=\"./Images/language_detection_output_1.png\" width=800 height=400 alt=\"Language detection bucket Output image\">" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4eb348cd-be1d-4e65-9984-e88b31f5611f", "metadata": {}, "source": [ "<img src=\"./Images/language_detection_output_2.png\" width=800 height=400 alt=\"Language detection CSV output image\">" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e908ac4d-06fb-4aeb-9eac-c677b5a9f881", "metadata": {}, "source": [ "## Supported Languages and File types [Link](https://cloud.google.com/translate/docs/advanced/translate-documents)\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f9877297-ed6e-434a-a274-2bcb9b023300", "metadata": {}, "source": [ "<img src=\"./Images/language_detection_output_3.png\" width=800 height=400 alt=\"Language detection supported language image\">" ] }, { "attachments": {}, "cell_type": "markdown", "id": "5d2b1d63-aef5-4d66-b393-03670738f124", "metadata": {}, "source": [ "## Supported Languages [Link](https://cloud.google.com/translate/docs/advanced/discovering-supported-languages-v3)\n" ] }, { "attachments": {}, "cell_type": "markdown", "id": "700f9f28-c076-4b6f-ab8c-a0d99ce7a101", "metadata": {}, "source": [ "### 1. Function to get all the supported languages in the project" ] }, { "cell_type": "code", "execution_count": null, "id": "79c791be-7b0e-4339-93f1-60fbccf3667b", "metadata": {}, "outputs": [], "source": [ "from google.cloud import translate\n", "\n", "\n", "def get_supported_languages(\n", " project_id: str, # = \"YOUR_PROJECT_ID\",\n", ") -> translate.SupportedLanguages:\n", " \"\"\"Getting a list of supported language codes.\n", "\n", " Args:\n", " project_id: The GCP project ID.\n", "\n", " Returns:\n", " A list of supported language codes.\n", " \"\"\"\n", " client = translate.TranslationServiceClient()\n", "\n", " parent = client.common_project_path(project=project_id)\n", "\n", " # Supported language codes: https://cloud.google.com/translate/docs/languages\n", " response = client.get_supported_languages(parent=parent)\n", "\n", " # List language codes of supported languages.\n", " print(\"Supported Languages:\")\n", " for language in response.languages:\n", " print(f\"Language Code: {language.language_code}\")\n", "\n", " return response\n", "\n", "\n", "project_id = \"<PROJECT_ID>\"\n", "get_supported_languages(project_id)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "84356940-95a8-4489-bfc3-b85611f9558a", "metadata": {}, "source": [ "### 2. Function to get all the supported languages with target language" ] }, { "cell_type": "code", "execution_count": 4, "id": "09a1dbc0-5a31-4d48-af08-7b3a96331c7e", "metadata": {}, "outputs": [], "source": [ "from google.cloud import translate\n", "\n", "\n", "def get_supported_languages_with_target(\n", " project_id: str, # = \"YOUR_PROJECT_ID\"\n", ") -> translate.SupportedLanguages:\n", " \"\"\"Listing supported languages with target language name.\n", "\n", " Args:\n", " project_id: Your Google Cloud project ID.\n", "\n", " Returns:\n", " Supported languages.\n", " \"\"\"\n", " client = translate.TranslationServiceClient()\n", " parent = client.common_project_path(project=project_id)\n", "\n", " # Supported language codes: https://cloud.google.com/translate/docs/languages\n", " response = client.get_supported_languages(\n", " display_language_code=\"fr\", parent=parent # target language code\n", " )\n", " # List language codes of supported languages\n", " for language in response.languages:\n", " print(f\"Language Code: {language.language_code}\")\n", " print(f\"Display Name: {language.display_name}\")\n", "\n", " return response\n", "\n", "\n", "project_id = \"<PROJECT_ID>\"\n", "get_supported_languages_with_target(project_id)" ] } ], "metadata": { "environment": { "kernel": "python3", "name": "common-cpu.m104", "type": "gcloud", "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 5 }