def split_pdf()

in classify-split-extract-workflow/classify-job/split_and_classify.py [0:0]

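For context, a minimal sketch of the module-level imports this function relies on. The third-party imports (pikepdf, google-cloud-documentai, google-cloud-documentai-toolbox) are standard; the import paths for the project-local helpers (gcs_helper, utils, get_metadata, add_predicted_document_type, SPLITTER_OUTPUT_DIR, logger) are assumptions, since those are defined elsewhere in classify-job:

import logging
import os
from typing import Dict, List

from google.cloud.documentai_v1 import Document
from google.cloud.documentai_toolbox import gcs_utilities
from pikepdf import Pdf

# Project-local helpers; assumed import locations, defined elsewhere in classify-job.
import gcs_helper
import utils
from config import SPLITTER_OUTPUT_DIR  # hypothetical module name
from docai_utils import get_metadata, add_predicted_document_type  # hypothetical

logger = logging.getLogger(__name__)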

def split_pdf(gcs_uri: str, entities: List[Document.Entity]) -> Dict:
    """Splits local PDF file into multiple PDF files based on output from a
    Splitter/Classifier processor.

    Args:
      gcs_uri (str):
          Required. The path to the PDF file.
      entities (List[Document.Entity]):
          Required. The list of entities to be split.
    Returns:
      List[str]:
          A list of output pdf files.
    """

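    # Output mapping, filled in by add_predicted_document_type for each result file.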
    documents: Dict = {}

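    # A single entity means the whole file is one document: no physical split,
    # just attach the classification metadata to the original GCS object.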
    if len(entities) == 1:
        metadata = get_metadata(entities[0])
        metadata.update({"original": gcs_uri})

        gcs_helper.add_metadata(gcs_uri=gcs_uri, metadata=metadata)
        add_predicted_document_type(
            metadata=metadata, input_gcs_source=gcs_uri, documents=documents
        )
    else:
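        # Multiple entities: download the PDF to a temp directory and cut out
        # one subdocument per entity.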
        temp_local_dir = os.path.join(
            os.path.dirname(__file__), "temp_files", utils.get_utc_timestamp()
        )
        if not os.path.exists(temp_local_dir):
            os.makedirs(temp_local_dir)

        pdf_path = os.path.join(temp_local_dir, os.path.basename(gcs_uri))
        gcs_helper.download_file(gcs_uri=gcs_uri, output_filename=pdf_path)

        input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
        bucket_name, _ = gcs_utilities.split_gcs_uri(gcs_uri)

        with Pdf.open(pdf_path) as pdf:
            for entity in entities:
                subdoc_type = entity.type_ or "subdoc"
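                # page_refs carry zero-based page numbers from the processor output.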
                page_refs = entity.page_anchor.page_refs
                if page_refs:
                    start_page = int(page_refs[0].page)
                    end_page = int(page_refs[-1].page)
                else:
                    logger.warning(
                        f"Skipping entity in {pdf_path}: no page refs, nothing to split"
                    )
                    continue
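                # Build a human-readable, 1-based page label for the output filename.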
                page_range = (
                    f"pg{start_page + 1}"
                    if start_page == end_page
                    else f"pg{start_page + 1}-{end_page + 1}"
                )
                output_filename = (
                    f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
                )
                metadata = get_metadata(entity)
                metadata.update({"original": gcs_uri})

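                # Place the split file next to the original, under SPLITTER_OUTPUT_DIR.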
                gcs_path = gcs_utilities.split_gcs_uri(os.path.dirname(gcs_uri))[1]
                destination_blob_name = os.path.join(
                    gcs_path, SPLITTER_OUTPUT_DIR, output_filename
                )
                destination_blob_uri = f"gs://{bucket_name}/{destination_blob_name}"

                local_out_file = os.path.join(temp_local_dir, output_filename)

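                # Copy the entity's page range into a new PDF; min_version saves it
                # with at least the source file's PDF version.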
                subdoc = Pdf.new()
                subdoc.pages.extend(pdf.pages[start_page : end_page + 1])
                subdoc.save(local_out_file, min_version=pdf.pdf_version)

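                # Upload the subdocument, then attach the classification metadata
                # to the new blob.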
                gcs_helper.upload_file(
                    bucket_name=bucket_name,
                    source_file_name=local_out_file,
                    destination_blob_name=destination_blob_name,
                )
                gcs_helper.add_metadata(destination_blob_uri, metadata)

                add_predicted_document_type(
                    metadata=metadata,
                    input_gcs_source=destination_blob_uri,
                    documents=documents,
                )

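        # Clean up the downloaded original and the local split outputs.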
        utils.delete_directory(temp_local_dir)
    return documents
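
A hedged usage sketch of the call site: process_document_splitter is a hypothetical stand-in for the workflow's actual call to the Splitter/Classifier processor, and the URI is a placeholder:

# Placeholder input; the real workflow passes the URI of the uploaded file.
gcs_uri = "gs://my-bucket/input/packet.pdf"
# Hypothetical helper that runs the Splitter/Classifier processor and returns
# the resulting documentai Document with classified entities.
document = process_document_splitter(gcs_uri)
documents = split_pdf(gcs_uri=gcs_uri, entities=list(document.entities))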