def collate_pages()

in hacks/genai-intro/artifacts/function/main.py [0:0]


def collate_pages(bucket: str, folder: str) -> str:
    """Collate all pages stored as JSON files into a single text string.

    Lists every blob under ``folder`` in ``bucket``, parses each one as JSON
    and concatenates the ``fullTextAnnotation.text`` value of every entry in
    its ``responses`` list, in the order the blobs are listed.

    Entries without a ``fullTextAnnotation`` (e.g. pages where no text was
    detected) contribute nothing to the result instead of raising.

    Do not edit.

    Args:
        bucket: bucket without the gs prefix, e.g. my-staging-bucket
        folder: folder name, e.g. my-file/

    Returns:
        complete text of the PDF document as a single string in regular text
        format; an empty string when no blob under ``folder`` yields any text

    Raises:
        json.JSONDecodeError: if a listed blob does not contain valid JSON.
        KeyError: if a parsed JSON document has no ``responses`` key.
    """
    storage_client = storage.Client(project=PROJECT_ID)
    # Use a separate name so the ``bucket`` parameter keeps its ``str`` type.
    bucket_handle = storage_client.get_bucket(bucket)

    # NOTE(review): blobs are consumed in listing order, which is presumably
    # lexicographic by object name; if the file names carry unpadded page
    # numbers, verify that this matches the page order of the document.
    text_parts = []
    for output in bucket_handle.list_blobs(prefix=folder):
        # Skip zero-content "folder" placeholder objects; they are not pages
        # and would fail JSON parsing.
        if output.name.endswith("/"):
            continue

        json_string = output.download_as_bytes().decode("utf-8")
        response = json.loads(json_string)
        for page in response["responses"]:
            # The annotation is absent for pages with no detected text.
            annotation = page.get("fullTextAnnotation")
            if annotation:
                text_parts.append(annotation.get("text", ""))

    # Join once at the end rather than growing a string with repeated "+=".
    return "".join(text_parts)