filter-hitl-language/docai_utils.py (41 lines of code) (raw):

"""Document AI Functions.

Helpers that read Document AI ``Document`` JSON files from Cloud Storage,
determine each document's predominant language, and move the file into a
bucket named after that language.
"""
from collections import defaultdict
from typing import Any

from gcs_utils import create_bucket
from gcs_utils import get_all_buckets
from gcs_utils import get_files_from_gcs
from gcs_utils import move_file
from google.cloud import documentai_v1 as documentai

# Language code returned when no usable language was detected in a document.
UNDEFINED_LANGUAGE = "und"


def sort_document_files_by_language(
    gcs_input_bucket: str, gcs_input_prefix: str, gcs_output_bucket: str
) -> None:
    """Move Document JSON files between buckets based on language.

    Every blob listed under ``gcs_input_prefix`` in ``gcs_input_bucket`` whose
    name ends with ``.json`` is downloaded, parsed as a Document AI
    ``Document`` and moved to the bucket named
    ``f"{gcs_output_bucket}{language}"``, where ``language`` is the value
    returned by ``get_most_frequent_language``. The destination bucket is
    created first when its name is not among the known buckets.

    Args:
        gcs_input_bucket: Name of the bucket holding the Document JSON files.
        gcs_input_prefix: Prefix used to list the input files.
        gcs_output_bucket: Prefix of the destination bucket names; the
            language code is appended to it without a separator.
    """
    blobs = get_files_from_gcs(gcs_input_bucket, gcs_input_prefix)
    # NOTE(review): `.add()` is called on this value below, so it is presumably
    # a mutable set of bucket names -- confirm against gcs_utils.
    buckets = get_all_buckets()

    # Output Document.json Files
    for blob in blobs:
        # Match on the suffix rather than on any occurrence of ".json": a
        # substring test also accepts names such as "exports.json/scan.pdf"
        # or "doc.json.bak", which are not Document JSON files.
        if not blob.name.endswith(".json"):
            print(f"Skipping non-supported file type {blob.name}")
            continue

        print(f"Downloading {blob.name}")
        document = documentai.types.Document.from_json(
            blob.download_as_bytes(), ignore_unknown_fields=True
        )

        # Find the most frequent language in the document
        predominant_language = get_most_frequent_language(document)
        print(f"Predominant Language: {predominant_language}")

        # Create the output bucket if it does not exist
        # NOTE(review): the language code is used verbatim in the bucket name;
        # verify that every code Document AI can return is a valid bucket-name
        # suffix (Cloud Storage bucket names do not allow uppercase letters).
        language_bucket_name = f"{gcs_output_bucket}{predominant_language}"
        if language_bucket_name not in buckets:
            print(f"Creating bucket {language_bucket_name}")
            create_bucket(language_bucket_name)
            # Remember the new bucket so it is not created again for the
            # next document in the same language.
            buckets.add(language_bucket_name)

        # Move Document.json file to bucket based on language
        move_file(gcs_input_bucket, blob.name, language_bucket_name)


def get_most_frequent_language(document: documentai.Document) -> str:
    """Return the most frequent language in the document.

    Counts one occurrence per entry of ``page.detected_languages`` across all
    pages. Entries whose code is ``UNDEFINED_LANGUAGE`` and entries with a
    non-zero confidence below 0.5 are ignored.

    Args:
        document: Parsed Document AI document whose pages are inspected.

    Returns:
        The language code with the highest count (the first one counted wins
        a tie), or ``UNDEFINED_LANGUAGE`` when no entry was counted.
    """
    language_frequency: defaultdict[Any, int] = defaultdict(int)

    for page in document.pages:
        for language in page.detected_languages:
            # A falsy confidence (0) is deliberately not filtered out here;
            # presumably it means the field was not populated -- TODO confirm.
            if language.language_code == UNDEFINED_LANGUAGE or (
                language.confidence and language.confidence < 0.5
            ):
                continue
            language_frequency[language.language_code] += 1

    # `default` covers the case where every detected language was skipped.
    return max(
        language_frequency,
        key=language_frequency.get,  # type: ignore
        default=UNDEFINED_LANGUAGE,
    )