filter-hitl-language/docai_utils.py (41 lines of code) (raw):
"""
Document AI Functions
"""
from collections import defaultdict
from typing import Any
from gcs_utils import create_bucket
from gcs_utils import get_all_buckets
from gcs_utils import get_files_from_gcs
from gcs_utils import move_file
from google.cloud import documentai_v1 as documentai
# Sentinel language code: detected languages carrying this code are ignored
# when counting, and it is returned when no language qualifies.
UNDEFINED_LANGUAGE = "und"
def sort_document_files_by_language(
    gcs_input_bucket: str, gcs_input_prefix: str, gcs_output_bucket: str
) -> None:
    """
    Move Document JSON files into per-language buckets.

    Every blob listed under ``gcs_input_prefix`` in ``gcs_input_bucket`` whose
    name ends with ``.json`` is downloaded and parsed as a Document AI
    ``Document``. Its predominant language is determined with
    ``get_most_frequent_language`` and the blob is moved to the bucket named
    ``gcs_output_bucket`` + the lower-cased language code. That bucket is
    created first if it is not already known.

    Args:
        gcs_input_bucket: Name of the bucket holding the Document JSON files.
        gcs_input_prefix: Prefix used to list the files in the input bucket.
        gcs_output_bucket: Prefix of the destination bucket names; the
            language code is appended to it.
    """
    blobs = get_files_from_gcs(gcs_input_bucket, gcs_input_prefix)
    # Expected to be a mutable set of bucket names: it is tested with ``in``
    # and extended with ``.add`` below.
    buckets = get_all_buckets()

    # Output Document.json Files
    for blob in blobs:
        # Match on the suffix rather than a substring, so names such as
        # "a.json.bak" or "dir.json/file.pdf" are not parsed as Document JSON.
        if not blob.name.endswith(".json"):
            print(f"Skipping non-supported file type {blob.name}")
            continue

        print(f"Downloading {blob.name}")
        document = documentai.types.Document.from_json(
            blob.download_as_bytes(), ignore_unknown_fields=True
        )

        # Find the most frequent language in the document
        predominant_language = get_most_frequent_language(document)
        print(f"Predominant Language: {predominant_language}")

        # Cloud Storage bucket names must be lowercase, while language codes
        # can be mixed case (e.g. "zh-Hant"), so normalise the suffix.
        language_bucket_name = f"{gcs_output_bucket}{predominant_language.lower()}"

        # Create the output bucket if it does not exist
        if language_bucket_name not in buckets:
            print(f"Creating bucket {language_bucket_name}")
            create_bucket(language_bucket_name)
            # Remember it so later files in the same language skip creation.
            buckets.add(language_bucket_name)

        # Move Document.json file to bucket based on language
        move_file(gcs_input_bucket, blob.name, language_bucket_name)
def get_most_frequent_language(document: documentai.Document) -> str:
    """
    Pick the language code detected most often across the document's pages.

    Detected languages whose code equals UNDEFINED_LANGUAGE are ignored, as
    are those with a non-zero confidence below 0.5 (a falsy confidence is not
    filtered out). Returns UNDEFINED_LANGUAGE when no language qualifies.
    """
    counts: dict[Any, int] = {}
    for page in document.pages:
        for detected in page.detected_languages:
            code = detected.language_code
            if code == UNDEFINED_LANGUAGE:
                continue
            confidence = detected.confidence
            if confidence and confidence < 0.5:
                continue
            counts[code] = counts.get(code, 0) + 1

    if not counts:
        return UNDEFINED_LANGUAGE
    # max() keeps the first key that reaches the highest count, so ties go to
    # the language that was seen earliest.
    return max(counts, key=lambda code: counts[code])