in workflow3_local/local_endpointbuilder.py [0:0]
def get_processed_images(temp_dir_path):
    """Find the PDF/JPEG/PNG files under ``local_dataset_docs`` and process them.

    The tree rooted at ``temp_dir_path + "/local_dataset_docs"`` is walked
    recursively. Every file whose type, as reported by ``filetype.guess``,
    has the extension ``pdf``, ``jpg`` or ``png`` is handed to
    ``process_file`` on a thread pool.

    Args:
        temp_dir_path: Directory that contains the ``local_dataset_docs``
            folder. It is also forwarded unchanged as the second argument
            of every ``process_file`` call.

    Returns:
        A list holding one ``process_file`` result per accepted file, in the
        order the files were found. According to the original notes on this
        function (``process_file`` itself is not visible here -- confirm
        against its definition), an image yields ``(rel_path, _class)`` and
        a PDF yields ``(file_name, _class, pdf_num_pages)``.
    """
    accepted_extensions = {"pdf", "jpg", "png"}
    dataset_root = temp_dir_path + "/local_dataset_docs"

    # Each entry is a (file_path, file_extension) pair.
    candidates = []
    for folder, _subfolders, names in os.walk(dataset_root):
        for name in names:
            path = os.path.join(folder, name)
            guessed = filetype.guess(path)
            # A None guess means no type was identified for the file; it is
            # skipped the same way as a recognised but unsupported type.
            extension = None if guessed is None else guessed.extension
            if extension in accepted_extensions:
                candidates.append((path, extension))

    with ThreadPoolExecutor() as executor:
        # executor.map is lazy, so the results are collected into a list
        # while the pool is still open.
        processed = list(
            executor.map(process_file, candidates, repeat(temp_dir_path))
        )
    return processed