in webhook/document_extract.py [0:0]
def _collect_page_texts(ocr_output: dict) -> list:
    """Returns the text of every page response in one parsed OCR output file.

    Arguments:
        ocr_output: the decoded JSON content of a single output blob. It must
            contain a "responses" list.

    Returns:
        The "text" of each response's "fullTextAnnotation", in file order.
        Responses that carry no "fullTextAnnotation" are skipped.
    """
    page_texts = []
    for page_response in ocr_output["responses"]:
        annotation = page_response.get("fullTextAnnotation")
        if annotation is None:
            # Presumably a page on which no text was detected (or a per-page
            # error); one such page should not abort the whole extraction.
            continue
        page_texts.append(annotation.get("text", ""))
    return page_texts


def get_ocr_output_from_bucket(gcs_destination_uri: str, bucket_name: str) -> str:
    """Iterates over blobs in output bucket to get full OCR result.

    Arguments:
        gcs_destination_uri: the URI where the OCR output was saved, in the
            form "gs://<bucket>/<object prefix>". Only the object prefix is
            used; the bucket is taken from bucket_name.
        bucket_name: the name of the bucket where the output was saved.

    Returns:
        The full text of the document: the text of every page response in
        every output blob under the prefix, concatenated without separators.

    Raises:
        ValueError: if gcs_destination_uri is not of the form
            "gs://<bucket>/<object prefix>".
    """
    # Validate the URI before creating a client so that a malformed argument
    # fails fast with a clear message instead of an AttributeError on None.
    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    if match is None:
        raise ValueError(
            "gcs_destination_uri must look like 'gs://<bucket>/<prefix>', "
            f"got {gcs_destination_uri!r}"
        )
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # NOTE(review): blobs are consumed in the order list_blobs yields them.
    # If the output object names are not zero-padded, that order may differ
    # from the document's page order -- confirm against the naming scheme.
    text_parts = []
    for blob in bucket.list_blobs(prefix=prefix):
        # Names ending in "/" are folder placeholders, not OCR output files.
        if blob.name.endswith("/"):
            continue
        ocr_output = json.loads(blob.download_as_bytes().decode("utf-8"))
        # An output file may hold several page responses; keep all of them,
        # not just the first one.
        text_parts.extend(_collect_page_texts(ocr_output))

    return "".join(text_parts)