in classify-split-extract-workflow/classify-job/split_and_classify.py [0:0]
def split_pdf(gcs_uri: str, entities: List[Document.Entity]) -> Dict:
    """Splits a PDF file in Cloud Storage into multiple PDF files based on the
    output of a Splitter/Classifier processor.

    Args:
        gcs_uri (str):
            Required. The GCS URI of the input PDF file.
        entities (List[Document.Entity]):
            Required. The list of entities to split on.

    Returns:
        Dict:
            The output documents keyed by predicted document type, as
            populated by add_predicted_document_type().
    """
    documents: Dict = {}
    if len(entities) == 1:
        # Single entity: nothing to split. Attach the classification metadata
        # to the original object and record its predicted type.
        metadata = get_metadata(entities[0])
        metadata.update({"original": gcs_uri})
        gcs_helper.add_metadata(gcs_uri=gcs_uri, metadata=metadata)
        add_predicted_document_type(
            metadata=metadata, input_gcs_source=gcs_uri, documents=documents
        )
    else:
        # Download the PDF into a unique temporary directory before splitting.
        temp_local_dir = os.path.join(
            os.path.dirname(__file__), "temp_files", utils.get_utc_timestamp()
        )
        if not os.path.exists(temp_local_dir):
            os.makedirs(temp_local_dir)
        pdf_path = os.path.join(temp_local_dir, os.path.basename(gcs_uri))
        gcs_helper.download_file(gcs_uri=gcs_uri, output_filename=pdf_path)
        input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
        bucket_name, _ = gcs_utilities.split_gcs_uri(gcs_uri)
        with Pdf.open(pdf_path) as pdf:
            for entity in entities:
                subdoc_type = entity.type_ or "subdoc"
                page_refs = entity.page_anchor.page_refs
                if page_refs:
                    start_page = int(page_refs[0].page)
                    end_page = int(page_refs[-1].page)
                else:
                    logger.warning(
                        f"Skipping {pdf_path} entity due to no page refs, no splitting"
                    )
                    continue
                # Page numbers in filenames are 1-based; page_refs are 0-based.
                page_range = (
                    f"pg{start_page + 1}"
                    if start_page == end_page
                    else f"pg{start_page + 1}-{end_page + 1}"
                )
                output_filename = (
                    f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
                )
                metadata = get_metadata(entity)
                metadata.update({"original": gcs_uri})
                gcs_path = gcs_utilities.split_gcs_uri(os.path.dirname(gcs_uri))[1]
                destination_blob_name = os.path.join(
                    gcs_path, SPLITTER_OUTPUT_DIR, output_filename
                )
                destination_blob_uri = f"gs://{bucket_name}/{destination_blob_name}"
                local_out_file = os.path.join(temp_local_dir, output_filename)
                # Extract the entity's page range into a new PDF and save it locally.
                subdoc = Pdf.new()
                subdoc.pages.extend(pdf.pages[start_page : end_page + 1])
                subdoc.save(local_out_file, min_version=pdf.pdf_version)
                # Upload the subdocument next to the input under SPLITTER_OUTPUT_DIR
                # and tag it with the classification metadata.
                gcs_helper.upload_file(
                    bucket_name=bucket_name,
                    source_file_name=local_out_file,
                    destination_blob_name=destination_blob_name,
                )
                gcs_helper.add_metadata(destination_blob_uri, metadata)
                add_predicted_document_type(
                    metadata=metadata,
                    input_gcs_source=destination_blob_uri,
                    documents=documents,
                )
        utils.delete_directory(temp_local_dir)
    return documents
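

# --- Illustrative usage sketch (not part of the original module) -------------
# A minimal sketch of how a classify job might call split_pdf(), assuming a
# Splitter/Classifier processor has already run on the input PDF. The names
# `processor_document`, the bucket, and the object path below are placeholders
# introduced for illustration only, not values used by this workflow.
#
#     processor_document: Document = ...  # Splitter/Classifier Document output
#     documents = split_pdf(
#         gcs_uri="gs://example-bucket/input/sample.pdf",
#         entities=list(processor_document.entities),
#     )
#     # `documents` groups the resulting GCS sources by predicted document
#     # type, as populated by add_predicted_document_type(), so each group can
#     # be routed to the matching extraction processor.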