def add_image_to_csv()

in workflow3_local/local_endpointbuilder.py [0:0]


def add_image_to_csv(item, temp_dir_path):
    """Build one ``[class, text]`` row for a document by extracting its text.

    Args:
        item: An indexable pair ``(i, image_info)``. ``i`` is the zero-based
            row index, used only for the progress message. ``image_info`` is
            one of:

            * ``(rel_path, _class)`` -- a document that is a single image
              located at ``{temp_dir_path}/local_dataset_docs/{rel_path}``.
            * ``(file_name, _class, pdf_num_pages)`` -- a PDF whose pages are
              read from
              ``{temp_dir_path}/images_processed/{file_name}-img-{n}.jpg``
              for ``n`` in ``range(pdf_num_pages)``.
        temp_dir_path: Directory under which the image files are looked up.

    Returns:
        A two-element list ``[_class, raw_text]``. For the PDF form,
        ``raw_text`` is the text of every page concatenated in page order
        with no separator added between pages.

    Raises:
        ValueError: If ``image_info`` has neither 2 nor 3 elements.
    """
    i = item[0]
    image_info = item[1]
    _class = image_info[1]

    if len(image_info) == 2:
        # Document that is a single image file.
        rel_path = image_info[0]
        image_path = f"{temp_dir_path}/local_dataset_docs/{rel_path}"
        textract_json = call_textract_on_image(image_path)
        raw_text = get_lines_string(textract_json=textract_json)

    elif len(image_info) == 3:
        # PDF represented by one image file per page.
        file_name = image_info[0]
        pdf_num_pages = image_info[2]
        image_path_start = f"{temp_dir_path}/images_processed/{file_name}"

        # The generator is consumed lazily, so each page is sent for text
        # extraction and parsed in page order before the next one is requested.
        raw_text = "".join(
            get_lines_string(
                textract_json=call_textract_on_image(
                    f"{image_path_start}-img-{page_num}.jpg"
                )
            )
            for page_num in range(pdf_num_pages)
        )

    else:
        # Without this branch raw_text would be unbound and the function
        # would fail at the return statement with an UnboundLocalError that
        # hides the real cause.
        raise ValueError(
            f"image_info must have 2 or 3 elements, got {len(image_info)}"
        )

    # row is returned; contains document's class and the text within it
    print(f"Created row {i + 1}")
    return [_class, raw_text]