def prepare_data_for_batch_inference()

in cli/foundation-models/system/inference/image-text-embeddings/prepare_data.py


import base64
import os

import pandas as pd


def prepare_data_for_batch_inference(dataset_dir: str) -> None:
    """Prepare csv files for batch inference.

    This function walks every subfolder of the dataset directory, base64-encodes each
    image, and creates three folders of csv files: one whose files contain image samples
    only, one with text samples only, and one with both.
    :param dataset_dir: dataset directory
    :type dataset_dir: str
    """
    batch_input_file = "batch_input.csv"
    # Generate batch input for image embeddings
    image_list = []
    image_path_list = []

    # Walk each class subfolder, base64-encode every image, and record its path
    for dir_name in os.listdir(dataset_dir):
        dir_path = os.path.join(dataset_dir, dir_name)
        for path, _, files in os.walk(dir_path):
            for file in files:
                image_path = os.path.join(path, file)
                image = read_image(image_path)
                image_path_list.append(image_path)
                image_list.append(base64.encodebytes(image).decode("utf-8"))

    # Image-only rows: base64 image in the "image" column, empty "text" column
    image_data = [[image, ""] for image in image_list]
    batch_df = pd.DataFrame(image_data, columns=["image", "text"])

    image_csv_folder_path = os.path.join(dataset_dir, "image_batch")
    os.makedirs(image_csv_folder_path, exist_ok=True)
    # Divide this into files of 10 rows each
    batch_size_per_predict = 10
    for i in range(0, len(batch_df), batch_size_per_predict):
        j = i + batch_size_per_predict
        batch_df[i:j].to_csv(
            os.path.join(image_csv_folder_path, str(i) + batch_input_file)
        )

    # Generate batch input for text embeddings
    # supply a caption ("a photo of a <folder name>") derived from each image's parent folder
    text_data = [
        ["", "a photo of a " + os.path.basename(os.path.dirname(image_path))]
        for image_path in image_path_list
    ]
    batch_df = pd.DataFrame(text_data, columns=["image", "text"])

    text_csv_folder_path = os.path.join(dataset_dir, "text_batch")
    os.makedirs(text_csv_folder_path, exist_ok=True)
    # Divide this into files of 10 rows each
    batch_size_per_predict = 10
    for i in range(0, len(batch_df), batch_size_per_predict):
        j = i + batch_size_per_predict
        batch_df[i:j].to_csv(
            os.path.join(text_csv_folder_path, str(i) + batch_input_file)
        )

    # Generate batch input for image and text embeddings
    # supply base64 images for the image samples and the matching folder-name caption for each text sample
    image_text_data = [
        [image, "a photo of a " + os.path.basename(os.path.dirname(image_path))]
        for image, image_path in zip(image_list, image_path_list)
    ]
    batch_df = pd.DataFrame(image_text_data, columns=["image", "text"])

    image_text_csv_folder_path = os.path.join(dataset_dir, "image_text_batch")
    os.makedirs(image_text_csv_folder_path, exist_ok=True)
    # Divide this into files of 10 rows each
    batch_size_per_predict = 10
    for i in range(0, len(batch_df), batch_size_per_predict):
        j = i + batch_size_per_predict
        batch_df[i:j].to_csv(
            os.path.join(image_text_csv_folder_path, str(i) + batch_input_file)
        )