def get_ocr_output_from_bucket()

in webhook/document_extract.py [0:0]


def get_ocr_output_from_bucket(gcs_destination_uri: str, bucket_name: str) -> str:
    """Collects the full OCR text from the JSON result files in the output bucket.

    Every blob stored under the destination prefix is parsed as JSON with a
    top-level ``responses`` list, and the ``fullTextAnnotation.text`` of every
    response in every file is concatenated.

    Arguments:
        gcs_destination_uri: the URI where the OCR output was saved, in the
            form ``gs://<bucket>/<prefix>``. Only the prefix part is used.
        bucket_name: the name of the bucket where the output was saved.

    Returns:
        The full text of the document, or an empty string when no text was
        found under the prefix.

    Raises:
        ValueError: if ``gcs_destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``.
    """
    # Validate the URI before touching the network so a malformed value fails
    # fast with a clear message instead of an AttributeError on ``None``.
    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    if match is None:
        raise ValueError(
            f"Invalid GCS destination URI: {gcs_destination_uri!r} "
            "(expected 'gs://<bucket>/<prefix>')"
        )
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    def natural_key(blob):
        # Splitting on a captured digit run puts the numbers at odd indexes;
        # comparing them as ints keeps e.g. "output-3-to-4.json" ahead of
        # "output-11-to-12.json", which plain string ordering would swap.
        parts = re.split(r"([0-9]+)", blob.name)
        return [int(part) if i % 2 else part for i, part in enumerate(parts)]

    # List objects with the given prefix, filtering out folder placeholders,
    # and process them in page order rather than lexicographic order.
    output_blobs = sorted(
        (
            blob
            for blob in bucket.list_blobs(prefix=prefix)
            if not blob.name.endswith("/")
        ),
        key=natural_key,
    )

    # Gather the text of every response; joined once at the end.
    text_parts = []
    for output_blob in output_blobs:
        response = json.loads(output_blob.download_as_bytes().decode("utf-8"))

        # A single output file may hold several responses, so read them all
        # instead of only the first one.
        for page_response in response.get("responses", []):
            # NOTE(review): responses without a "fullTextAnnotation" (e.g. a
            # page with no detected text) are skipped rather than raising
            # KeyError; confirm whether error responses should be surfaced.
            annotation = page_response.get("fullTextAnnotation")
            if annotation:
                text_parts.append(annotation.get("text", ""))

    return "".join(text_parts)