in workflow3_local/local_docsplitter.py [0:0]
def main(endpoint_arn, choice, input_pdf_info):
    root_path = os.path.dirname(os.path.abspath(__file__))
    _id = datetime.now().strftime("%Y%m%d%H%M%S")
    # Create uniquely named (timestamped) temp and output directories alongside this script.
    temp_dir_name = f"workflow2_temp_documents-{_id}"
    temp_dir_path = f"{root_path}/{temp_dir_name}"
    output_dir_name = f"workflow2_output_documents-{_id}"
    output_dir_path = f"{root_path}/{output_dir_name}"
    create_directories([temp_dir_path, output_dir_path])
    s3 = boto3.client('s3')
    if choice == "s3":
        # Parse the bucket name and object key out of the s3:// URI, then download the PDF locally.
        input_pdf_uri = input_pdf_info
        bucket_name = input_pdf_uri.split("/")[2]
        input_pdf_key = input_pdf_uri.split(bucket_name + "/", 1)[1]
        input_pdf_path = f"{temp_dir_path}/input.pdf"
        with open(input_pdf_path, "wb") as data:
            s3.download_fileobj(bucket_name, input_pdf_key, data)
    elif choice == "local":
        # The input is already a local path; use it directly.
        input_pdf_path = input_pdf_info
    # pages_by_class is a dictionary:
    # keys are class names; values are lists of page numbers belonging to that class.
    pages_by_class = split_input_pdf_by_class(input_pdf_path, temp_dir_path, endpoint_arn, _id)
    create_output_pdfs(input_pdf_path, pages_by_class, output_dir_path, output_dir_name)
    # Remove the temporary working directory once the per-class PDFs have been written.
    rmtree(temp_dir_path)
    print(f"Multi-class PDFs have been created in the output folder, {output_dir_path}")