in cli/foundation-models/system/inference/image-text-embeddings/prepare_data.py [0:0]
def _write_csv_batches(rows, folder_path, file_suffix, rows_per_file=10):
    """Write ``rows`` into ``folder_path`` as CSV files of at most ``rows_per_file`` rows.

    :param rows: list of ``[image, text]`` pairs
    :type rows: list
    :param folder_path: folder that receives the csv files (created if missing)
    :type folder_path: str
    :param file_suffix: suffix appended to the index of the first row of each file
    :type file_suffix: str
    :param rows_per_file: maximum number of rows per csv file
    :type rows_per_file: int
    """
    os.makedirs(folder_path, exist_ok=True)
    batch_df = pd.DataFrame(rows, columns=["image", "text"])
    for start in range(0, len(batch_df), rows_per_file):
        # File name is "<index of first row><file_suffix>", e.g. "0batch_input.csv".
        batch_df[start : start + rows_per_file].to_csv(
            os.path.join(folder_path, str(start) + file_suffix)
        )


def prepare_data_for_batch_inference(dataset_dir: str) -> None:
    """Prepare folders of csv files for batch inference.

    Every file found under the sub-directories of ``dataset_dir`` is read with
    ``read_image`` and base64 encoded. Three folders are then created inside
    ``dataset_dir``, each holding csv files of up to 10 rows with the columns
    ``image`` and ``text``:

    * ``image_batch``: base64 image, empty text
    * ``text_batch``: empty image, caption text
    * ``image_text_batch``: base64 image and its caption text

    The caption is ``"a photo of a <name of the directory containing the image>"``.

    :param dataset_dir: dataset directory
    :type dataset_dir: str
    """
    batch_input_file = "batch_input.csv"
    output_folders = ("image_batch", "text_batch", "image_text_batch")

    image_list = []
    caption_list = []
    for dir_name in os.listdir(dataset_dir):
        # The output folders live inside dataset_dir; skip them so that a second
        # run does not treat previously generated csv files as images.
        if dir_name in output_folders:
            continue
        dir_path = os.path.join(dataset_dir, dir_name)
        for path, _, files in os.walk(dir_path):
            for file in files:
                image_path = os.path.join(path, file)
                image = read_image(image_path)
                image_list.append(base64.encodebytes(image).decode("utf-8"))
                # Keep the caption next to its image so both lists stay aligned.
                caption_list.append(
                    "a photo of a " + os.path.basename(os.path.dirname(image_path))
                )

    # Generate batch input for image embeddings
    _write_csv_batches(
        [[image, ""] for image in image_list],
        os.path.join(dataset_dir, "image_batch"),
        batch_input_file,
    )

    # Generate batch input for text embeddings
    # supply strings describing the images
    _write_csv_batches(
        [["", caption] for caption in caption_list],
        os.path.join(dataset_dir, "text_batch"),
        batch_input_file,
    )

    # Generate batch input for image and text embeddings
    # pair each base64 image with its own caption (not the caption of the
    # last image visited)
    _write_csv_batches(
        [[image, caption] for image, caption in zip(image_list, caption_list)],
        os.path.join(dataset_dir, "image_text_batch"),
        batch_input_file,
    )