in workflow3_local/local_endpointbuilder.py [0:0]
def add_image_to_csv(item, temp_dir_path):
    """Build one CSV row (class label plus extracted text) for a document.

    Args:
        item: Indexable pair ``(i, image_info)``. ``i`` is the zero-based row
            index and is used only for the progress message. ``image_info``
            takes one of two layouts:

            * ``(rel_path, _class)`` -- the document was originally an image,
              read from ``{temp_dir_path}/local_dataset_docs/{rel_path}``.
            * ``(file_name, _class, pdf_num_pages)`` -- the document was a PDF
              converted to one image per page, read from
              ``{temp_dir_path}/images_processed/{file_name}-img-{page}.jpg``
              for ``page`` in ``range(pdf_num_pages)``.
        temp_dir_path: Directory that contains the ``local_dataset_docs`` and
            ``images_processed`` folders referenced above.

    Returns:
        A two-element list ``[_class, raw_text]``. For a PDF, ``raw_text`` is
        the text of every page concatenated in page order with no separator.

    Raises:
        ValueError: If ``image_info`` has more than three entries, so neither
            layout applies. (With fewer than two entries, reading the class
            label raises ``IndexError`` first.)
    """
    i, image_info = item[0], item[1]
    _class = image_info[1]
    num_fields = len(image_info)

    if num_fields == 2:
        # Document that was originally a single image.
        rel_path = image_info[0]
        image_path = f"{temp_dir_path}/local_dataset_docs/{rel_path}"
        textract_json = call_textract_on_image(image_path)
        raw_text = get_lines_string(textract_json=textract_json)
    elif num_fields == 3:
        # PDF that has been converted into one image per page.
        file_name = image_info[0]
        pdf_num_pages = image_info[2]
        image_path_start = f"{temp_dir_path}/images_processed/{file_name}"
        # The generator is consumed lazily, so pages are still processed
        # one at a time and in order; join avoids repeated string copying.
        raw_text = "".join(
            get_lines_string(
                textract_json=call_textract_on_image(
                    f"{image_path_start}-img-{page_num}.jpg"
                )
            )
            for page_num in range(pdf_num_pages)
        )
    else:
        # Previously this fell through and crashed at the return statement
        # with an UnboundLocalError on raw_text; fail with a clear message.
        raise ValueError(
            f"image_info must have 2 or 3 entries, got {num_fields}: "
            f"{image_info!r}"
        )

    # Row contains the document's class and the text within it.
    print(f"Created row {i + 1}")
    return [_class, raw_text]