in hacks/genai-intro/artifacts/function/main.py [0:0]
def extract_text_from_document(src_bucket: str, file_name: str, dst_bucket: str) -> str:
    """Run Vision OCR on a PDF stored in GCS and write the output to a staging bucket.

    The PDF at ``gs://<src_bucket>/<file_name>`` is submitted to the Vision API
    as an asynchronous document text detection request. The results, consisting
    of JSON files, are written to the destination bucket under a folder that
    carries the same name as the source file. This function blocks until the
    operation finishes or the wait on it times out.

    Do not edit.

    Args:
        src_bucket: source bucket without the gs prefix, e.g. my-uploaded-docs-bucket
        file_name: source file name, e.g. my-file.pdf
        dst_bucket: destination bucket without the gs prefix, e.g. my-staging-bucket

    Returns:
        The destination folder (``<file_name>/``): the name of the folder in
        the staging bucket where the JSON files for the PDF document are stored.
    """
    input_uri = f"gs://{src_bucket}/{file_name}"
    # Trailing slash so the OCR output is written under a per-document folder.
    output_uri = f"gs://{dst_bucket}/{file_name}/"

    ocr_client = vision.ImageAnnotatorClient()

    # One request: read the PDF from GCS, apply document text detection, and
    # write the output back to GCS with an output batch size of 2.
    ocr_request = vision.AsyncAnnotateFileRequest(
        features=[vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=input_uri),
            mime_type="application/pdf",
        ),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=output_uri),
            batch_size=2,
        ),
    )

    ocr_job = ocr_client.async_batch_annotate_files(requests=[ocr_request])
    # Block until the OCR operation is done; the return value is not needed
    # because the results are read from the destination bucket.
    # NOTE(review): timeout is presumably in seconds -- confirm against the
    # google-api-core Operation.result documentation.
    ocr_job.result(timeout=420)

    return f"{file_name}/"