in hacks/genai-intro/artifacts/function/main.py [0:0]
def collate_pages(bucket: str, folder: str) -> str:
    """Collate the OCR output pages stored as JSON files into a single string.

    Every blob under ``folder`` in ``bucket`` is downloaded, parsed as JSON and
    the ``fullTextAnnotation.text`` of each entry in its ``responses`` list is
    appended to the result. Blobs are processed in natural (numeric-aware)
    name order so that multi-digit page ranges stay in document order.
    Do not edit.

    Args:
        bucket: bucket without the gs prefix, e.g. my-staging-bucket
        folder: folder name, e.g. my-file/

    Returns:
        complete text of the PDF document as a single string in regular text
        format; an empty string when no blob matches the prefix.

    Raises:
        json.JSONDecodeError: if a blob under the prefix is not valid JSON.
        KeyError: if a parsed blob has no top-level "responses" entry.
    """
    # Function-scope import keeps this block self-contained.
    import re

    def natural_order(blob):
        # re.split with a capturing group alternates text / digit-run pieces,
        # so odd positions are always digit runs. Comparing them as ints puts
        # "output-3-to-4.json" before "output-11-to-12.json", which a plain
        # lexicographic listing does not.
        pieces = re.split(r"(\d+)", blob.name)
        return [int(piece) if index % 2 else piece
                for index, piece in enumerate(pieces)]

    storage_client = storage.Client(project=PROJECT_ID)
    # Separate name: do not rebind the `bucket` string parameter.
    source_bucket = storage_client.get_bucket(bucket)
    blobs = sorted(source_bucket.list_blobs(prefix=folder), key=natural_order)

    page_texts = []
    for blob in blobs:
        response = json.loads(blob.download_as_bytes().decode("utf-8"))
        for page in response["responses"]:
            # Pages on which no text was detected carry no
            # "fullTextAnnotation"; they contribute nothing instead of
            # aborting the whole document with a KeyError.
            annotation = page.get("fullTextAnnotation") or {}
            page_texts.append(annotation.get("text", ""))
    return "".join(page_texts)