def get_processed_images()

in workflow3_local/local_endpointbuilder.py [0:0]


def get_processed_images(temp_dir_path):
    """Process every supported document under ``local_dataset_docs``.

    Walks ``temp_dir_path + "/local_dataset_docs"`` recursively, keeps the
    files whose detected type is PDF, JPG or PNG, and runs ``process_file``
    on each of them in a thread pool.

    Args:
        temp_dir_path: Directory containing the ``local_dataset_docs``
            folder. It is also passed unchanged as the second argument of
            every ``process_file`` call.

    Returns:
        A list with one ``process_file`` result per accepted file, in the
        order the files were discovered. ``process_file`` is defined
        elsewhere; according to the original notes it returns
        ``(rel_path, _class)`` for an image and
        ``(file_name, _class, pdf_num_pages)`` for a PDF -- TODO confirm
        against ``process_file``.
    """
    accepted_extensions = {"pdf", "jpg", "png"}
    docs_root = temp_dir_path + "/local_dataset_docs"

    # Work items for process_file: (file_path, file_extension) pairs.
    candidates = []
    for parent, _subdirs, names in os.walk(docs_root):
        for name in names:
            path = os.path.join(parent, name)
            guessed = filetype.guess(path)
            # filetype.guess() gives None when it cannot determine a type;
            # such files are skipped, as are recognised types that are not
            # in the accepted set.
            if guessed is not None and guessed.extension in accepted_extensions:
                candidates.append((path, guessed.extension))

    with ThreadPoolExecutor() as pool:
        # map() returns a lazy iterator; leaving the ``with`` block waits
        # for all submitted work to finish before the results are collected.
        results = pool.map(process_file, candidates, repeat(temp_dir_path))
    return list(results)