in bq-connector/docai_bq_connector/doc_ai_processing/Processor.py [0:0]
def process(self) -> Union[DocumentOperation, ProcessedDocument, None]:
    """Process the document currently referenced by this Processor.

    Dispatches on ``self.content_type``:

    * PDF: runs a Document AI extractor, synchronously when the page count
      is within ``self.max_sync_page_count``, otherwise asynchronously.
      Successful synchronous results are persisted to GCS.
    * JSON: the document was already processed and went through HITL
      (human-in-the-loop) review; parse the HITL output instead.
    * anything else: logged and skipped.

    Returns:
        A ``ProcessedDocument`` for completed processing, a
        ``DocumentOperation`` for an in-flight async operation, or ``None``
        for unsupported content types.
    """
    # Initialize so the unsupported-content-type branch returns None
    # instead of raising UnboundLocalError at the final return.
    process_result = None
    gcs_doc_blob, gcs_doc_meta = self._get_gcs_blob()
    if self.content_type == CONTENT_TYPE_PDF:
        # Original document. Needs to be processed by a DocAI extractor.
        page_count = get_pdf_page_cnt(gcs_doc_blob)
        # Sync processing has a per-processor page limit:
        # https://cloud.google.com/document-ai/quotas
        if page_count <= self.max_sync_page_count:
            process_result = self._process_sync(document_blob=gcs_doc_blob)
        else:
            process_result = self._process_async()
        # isinstance() is False for None, so no separate None check is needed.
        # Async processing yields a DocumentOperation, which is not persisted here.
        if isinstance(process_result, ProcessedDocument):
            self._write_result_to_gcs(process_result.dictionary)
    elif self.content_type == CONTENT_TYPE_JSON:
        # This document was already processed and sent for HITL review.
        # The result must now be processed.
        logging.debug(
            f"Read DocAI HITL Output file = {self.bucket_name}/{self.file_name}"
        )
        process_result = self._process_hitl_output(gcs_doc_blob)
    else:
        logging.info(
            f"Skipping non-supported file type {self.file_name} with content type = {self.content_type}"
        )
    return process_result