def split_document()

in document-processing-workflows/src/functions/split-document/main.py [0:0]


def split_document(request):
    """Split a processed PDF into one PDF per detected entity and upload them.

    Triggered by Workflow. Reads the source PDF and the JSON document result
    from Cloud Storage, writes one PDF per entity of the result under
    ``resultPrefix`` and reports the created files.

    Args:
        request (flask.Request): The request object. Its JSON body must be an
            object containing ``resultBucket``, ``inputObject``,
            ``resultObject`` and ``resultPrefix``.
        <https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data>
    Returns:
        On success, the request JSON extended with a ``classifications`` list
        (one item per uploaded split with ``fileName``, ``objectName``,
        ``pages`` and ``type``). On invalid input, an ``(error_dict, status)``
        tuple with status 400 (bad request body) or 404 (object not found).
        Either form can be turned into a Response object using `make_response`
        <https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response>.
    """
    request_json = request.get_json(silent=True)
    print(request_json)

    # get_json(silent=True) yields None for a missing/invalid body; reject it
    # up front instead of failing later with an opaque TypeError/KeyError.
    if not isinstance(request_json, dict):
        return {"error": "Request body must be a JSON object."}, 400
    required_keys = ("resultBucket", "inputObject", "resultObject", "resultPrefix")
    missing_keys = [key for key in required_keys if key not in request_json]
    if missing_keys:
        return {
            "error": f"Missing required field(s): {', '.join(missing_keys)}"
        }, 400

    # The response is the request payload itself, extended in place.
    response = request_json
    storage_client = storage.Client()

    # NOTE(review): the input object is read from "resultBucket", exactly as
    # the original code did. If inputs live in a separate bucket this is the
    # wrong key -- confirm against the calling workflow.
    result_bucket = storage_client.get_bucket(request_json["resultBucket"])

    # get_blob() returns None (rather than raising) when the object is absent.
    input_blob = result_bucket.get_blob(request_json["inputObject"])
    if input_blob is None:
        return {
            "error": f"Input object not found: {request_json['inputObject']}"
        }, 404
    process_result_blob = result_bucket.get_blob(request_json["resultObject"])
    if process_result_blob is None:
        return {
            "error": f"Result object not found: {request_json['resultObject']}"
        }, 404

    input_content = input_blob.download_as_bytes()
    document = json.loads(process_result_blob.download_as_bytes())

    response["classifications"] = []
    input_stem = Path(request_json["inputObject"]).stem  # loop-invariant

    with io.BytesIO(input_content) as pdf_file, Pdf.open(pdf_file) as pdf:
        for entity in document.get("entities") or []:
            page_refs = (entity.get("page_anchor") or {}).get("page_refs") or []
            if not page_refs:
                # Nothing to extract; min()/max() would raise on an empty list.
                print(f"Skipping entity without page refs: {entity.get('type')}")
                continue

            # Presumably "page" is a zero-based index into the source PDF. Some
            # JSON exports encode it as a string or omit it when it is 0, so
            # coerce defensively -- TODO confirm against the producer.
            page_numbers = [int(page_ref.get("page", 0)) for page_ref in page_refs]
            start_page = min(page_numbers)
            end_page = max(page_numbers)

            file_name = (
                f"{input_stem}_{start_page:03d}-{end_page:03d}"
                f"_{entity['type']}_{uuid4()}.pdf"
            )
            split_blob_name = f"{request_json['resultPrefix']}/{file_name}"
            print(f"Blob name: {split_blob_name}")

            with Pdf.new() as output, io.BytesIO() as byte_io:
                for page_number in page_numbers:
                    output.pages.append(pdf.pages[page_number])
                output.save(byte_io)
                split_blob = result_bucket.blob(blob_name=split_blob_name)
                # rewind=True seeks the buffer back to 0 before uploading.
                split_blob.upload_from_file(
                    byte_io, rewind=True, content_type="application/pdf"
                )

            response["classifications"].append(
                {
                    "fileName": file_name,
                    "objectName": split_blob_name,
                    # Count the pages actually written; equals
                    # end - start + 1 whenever the refs are contiguous.
                    "pages": len(page_numbers),
                    "type": entity["type"],
                }
            )
    return response