def split_input_pdf_by_class()

in workflow3_local/local_docsplitter.py [0:0]


def split_input_pdf_by_class(input_pdf_path, temp_dir_path, endpoint_arn, _id):
    """Classify each page of a PDF and group page indices by predicted class.

    The PDF is rendered to one JPEG per page inside ``temp_dir_path``. Every
    page image is handed to ``call_textract_on_image``; the response is
    flattened to text by ``get_lines_string`` and passed to
    ``call_comprehend`` together with ``endpoint_arn``. The ``Name`` of the
    first entry in the response's ``Classes`` list is treated as the page's
    class and recorded through ``add_page_to_class``.

    NOTE(review): earlier comments said the page image is uploaded to S3
    before Textract runs; that is not visible in this function -- confirm
    inside ``call_textract_on_image``.

    Args:
        input_pdf_path: Path of the multi-page PDF to classify.
        temp_dir_path: Folder that receives the rendered page images.
        endpoint_arn: Comprehend endpoint ARN, forwarded to
            ``call_comprehend``.
        _id: Not used by this function.

    Returns:
        The dictionary handed to ``add_page_to_class`` for every page,
        called with zero-based page indices and the predicted class name.
    """
    textract_client = boto3.client('textract')
    comprehend_client = boto3.client('comprehend')
    class_to_pages = {}

    # Render one JPEG per page; only the file paths are returned.
    page_images = convert_from_path(
        pdf_path=input_pdf_path,
        fmt='jpeg',
        paths_only=True,
        output_folder=temp_dir_path,
    )

    for page_index, page_image in enumerate(page_images):
        # OCR the page, then flatten the Textract response into raw text.
        page_text = get_lines_string(
            textract_json=call_textract_on_image(textract_client, page_image)
        )
        prediction = call_comprehend(page_text, comprehend_client, endpoint_arn)
        # The first entry in 'Classes' is taken as the predicted class.
        first_class = prediction['Classes'][0]
        add_page_to_class(page_index, first_class['Name'], class_to_pages)

    print("Input PDF has been split up and classified\n")
    return class_to_pages