def extract_text_from_document()

in hacks/genai-intro/artifacts/function/main.py [0:0]


def extract_text_from_document(src_bucket: str, file_name: str, dst_bucket: str) -> str:
    """Runs OCR on a PDF stored in GCS and writes the JSON results back to GCS.

    The PDF at gs://<src_bucket>/<file_name> is submitted to the Vision API for
    asynchronous document text detection. The resulting JSON files are written
    to the destination bucket under a folder named after the source file.
    The call blocks until the operation completes, waiting at most 420 seconds.

    Do not edit.

    Args:
        src_bucket: name of the bucket holding the PDF, without the gs prefix,
            e.g. my-uploaded-docs-bucket
        file_name: name of the PDF within the source bucket, e.g. my-file.pdf
        dst_bucket: name of the bucket receiving the OCR output, without the
            gs prefix, e.g. my-staging-bucket

    Returns:
        the folder in the destination bucket that holds the JSON files for
        this PDF, i.e. the source file name followed by a slash
    """
    ocr_client = vision.ImageAnnotatorClient()

    # A single request describing what to detect, where to read the PDF from
    # and where to write the OCR output.
    ocr_request = vision.AsyncAnnotateFileRequest(
        features=[
            vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION),
        ],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=f"gs://{src_bucket}/{file_name}"),
            mime_type="application/pdf",
        ),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(
                uri=f"gs://{dst_bucket}/{file_name}/"
            ),
            batch_size=2,
        ),
    )

    # The annotation runs as a long-running operation; wait for it to finish
    # so the output files exist by the time the caller gets the folder name.
    pending = ocr_client.async_batch_annotate_files(requests=[ocr_request])
    pending.result(timeout=420)

    return f"{file_name}/"