in workflow3_local/local_docsplitter.py [0:0]
def split_input_pdf_by_class(input_pdf_path, temp_dir_path, endpoint_arn, _id):
    """Classify each page of a PDF and record its page index under its class.

    Steps performed for the given PDF:

    1. ``convert_from_path`` renders the PDF to JPEG files written into
       ``temp_dir_path`` and returns their paths (presumably one per page).
    2. Each image path is passed to ``call_textract_on_image`` together with
       a Textract client, and the response is turned into a single string
       by ``get_lines_string``.
    3. That string is sent through ``call_comprehend`` using ``endpoint_arn``.
    4. The ``Name`` of the first entry in the response's ``Classes`` list is
       used as the page's class, and ``add_page_to_class`` records the
       page's zero-based index under it.

    NOTE(review): whether the image is uploaded to S3 before Textract runs
    is decided inside ``call_textract_on_image``; nothing here touches S3.

    Args:
        input_pdf_path: Path of the multi-page PDF to classify.
        temp_dir_path: Folder that receives the rendered page images.
        endpoint_arn: Comprehend endpoint ARN, forwarded to
            ``call_comprehend``.
        _id: Accepted for interface compatibility; not used in this function.

    Returns:
        The dict populated by ``add_page_to_class`` (presumably class name
        mapped to page indices; confirm against that helper).
    """
    textract_client = boto3.client('textract')
    comprehend_client = boto3.client('comprehend')

    # Render the PDF to image files on disk; only the file paths come back.
    page_image_paths = convert_from_path(
        pdf_path=input_pdf_path,
        output_folder=temp_dir_path,
        fmt='jpeg',
        paths_only=True
    )

    grouped_pages = {}
    for page_index, page_image_path in enumerate(page_image_paths):
        # OCR the page image and collapse the response into one text blob.
        page_text = get_lines_string(
            textract_json=call_textract_on_image(textract_client, page_image_path)
        )

        # Assumes the first entry of 'Classes' is the top prediction and that
        # the list is non-empty (TODO confirm against the Comprehend response).
        prediction = call_comprehend(page_text, comprehend_client, endpoint_arn)
        predicted_class = prediction['Classes'][0]['Name']

        add_page_to_class(page_index, predicted_class, grouped_pages)

    print("Input PDF has been split up and classified\n")
    return grouped_pages