cli/foundation-models/system/inference/image-text-embeddings/prepare_data.py (155 lines of code) (raw):
import argparse
import base64
import json
import os
import shutil
import urllib.request
import pandas as pd
from zipfile import ZipFile
import random
import string
def download_and_unzip(dataset_parent_dir: str) -> None:
"""Download image dataset and unzip it.
:param dataset_parent_dir: dataset parent directory to which dataset will be downloaded
:type dataset_parent_dir: str
"""
# Create directory, if it does not exist
os.makedirs(dataset_parent_dir, exist_ok=True)
# download data
download_url = "https://automlsamplenotebookdata-adcuc7f7bqhhh8a4.b02.azurefd.net/image-classification/fridgeObjects.zip"
print(f"Downloading data from {download_url}")
# Extract current dataset name from dataset url
dataset_name = os.path.basename(download_url).split(".")[0]
# Get dataset path for later use
dataset_dir = os.path.join(dataset_parent_dir, dataset_name)
if os.path.exists(dataset_dir):
shutil.rmtree(dataset_dir)
# Get the name of zip file
data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")
# Download data from public url
urllib.request.urlretrieve(download_url, filename=data_file)
# extract files
with ZipFile(data_file, "r") as zip:
print("extracting files...")
zip.extractall(path=dataset_parent_dir)
print("done")
# delete zip file
os.remove(data_file)
return dataset_dir
def read_image(image_path: str) -> bytes:
"""Read image from path.
:param image_path: image path
:type image_path: str
:return: image in bytes format
:rtype: bytes
"""
with open(image_path, "rb") as f:
return f.read()
def prepare_data_for_online_inference(dataset_dir: str) -> None:
"""Prepare request json files for online inference.
:param dataset_dir: dataset directory
:type dataset_dir: str
"""
sample_image_1 = os.path.join(dataset_dir, "milk_bottle", "99.jpg")
sample_image_2 = os.path.join(dataset_dir, "can", "1.jpg")
# Generate sample request for image embeddings
image_request_json = {
"input_data": {
"columns": ["image", "text"],
"index": [0, 1],
"data": [
[
base64.encodebytes(read_image(sample_image_1)).decode("utf-8"),
"",
], # the "text" column should contain empty string
[base64.encodebytes(read_image(sample_image_2)).decode("utf-8"), ""],
],
}
}
request_file_name = os.path.join(dataset_dir, "image_request_data.json")
with open(request_file_name, "w") as request_file:
json.dump(image_request_json, request_file)
# Generate sample request for text embeddings
text_request_json = {
"input_data": {
"columns": ["image", "text"],
"index": [0, 1],
"data": [
[
"",
"a photo of a milk bottle",
], # the "image" column should contain empty string
["", "a photo of a metal can"],
],
}
}
request_file_name = os.path.join(dataset_dir, "text_request_data.json")
with open(request_file_name, "w") as request_file:
json.dump(text_request_json, request_file)
# Generate sample request for image and text embeddings
image_text_request_json = {
"input_data": {
"columns": ["image", "text"],
"index": [0, 1],
"data": [
[
base64.encodebytes(read_image(sample_image_1)).decode("utf-8"),
"a photo of a milk bottle",
], # all rows should have both images and text
[
base64.encodebytes(read_image(sample_image_2)).decode("utf-8"),
"a photo of a metal can",
],
],
}
}
request_file_name = os.path.join(dataset_dir, "image_text_request_data.json")
with open(request_file_name, "w") as request_file:
json.dump(image_text_request_json, request_file)
def prepare_data_for_batch_inference(dataset_dir: str) -> None:
"""Prepare image folder and csv files for batch inference.
This function will move all images to a single image folder and also create folders of csv
files. Each folder will have csv files that contain images in base64 format, text samples, or both.
:param dataset_dir: dataset directory
:type dataset_dir: str
"""
batch_input_file = "batch_input.csv"
# Generate batch input for image embeddings
image_list = []
image_path_list = []
for dir_name in os.listdir(dataset_dir):
dir_path = os.path.join(dataset_dir, dir_name)
for path, _, files in os.walk(dir_path):
for file in files:
image_path = os.path.join(path, file)
image = read_image(image_path)
image_path_list.append(image_path)
image_list.append(base64.encodebytes(image).decode("utf-8"))
image_data = [[image, ""] for image in image_list]
batch_df = pd.DataFrame(image_data, columns=["image", "text"])
image_csv_folder_path = os.path.join(dataset_dir, "image_batch")
os.makedirs(image_csv_folder_path, exist_ok=True)
# Divide this into files of 10 rows each
batch_size_per_predict = 10
for i in range(0, len(batch_df), batch_size_per_predict):
j = i + batch_size_per_predict
batch_df[i:j].to_csv(
os.path.join(image_csv_folder_path, str(i) + batch_input_file)
)
# Generate batch input for text embeddings
# supply strings describing the images
text_data = [
["", "a photo of a " + os.path.basename(os.path.dirname(image_path))]
for image_path in image_path_list
]
batch_df = pd.DataFrame(text_data, columns=["image", "text"])
text_csv_folder_path = os.path.join(dataset_dir, "text_batch")
os.makedirs(text_csv_folder_path, exist_ok=True)
# Divide this into files of 10 rows each
batch_size_per_predict = 10
for i in range(0, len(batch_df), batch_size_per_predict):
j = i + batch_size_per_predict
batch_df[i:j].to_csv(
os.path.join(text_csv_folder_path, str(i) + batch_input_file)
)
# Generate batch input for image and text embeddings
# supply base64 images for images samples and random strings for text samples
image_text_data = [
[image_list[i], "a photo of a " + os.path.basename(os.path.dirname(image_path))]
for i in range(len(image_list))
]
batch_df = pd.DataFrame(image_text_data, columns=["image", "text"])
image_text_csv_folder_path = os.path.join(dataset_dir, "image_text_batch")
os.makedirs(image_text_csv_folder_path, exist_ok=True)
# Divide this into files of 10 rows each
batch_size_per_predict = 10
for i in range(0, len(batch_df), batch_size_per_predict):
j = i + batch_size_per_predict
batch_df[i:j].to_csv(
os.path.join(image_text_csv_folder_path, str(i) + batch_input_file)
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Prepare data for zero-shot image classification"
)
parser.add_argument(
"--data_path", type=str, default="data", help="Dataset location"
)
parser.add_argument(
"--mode",
type=str,
default="online",
help="prepare data for online or batch inference",
)
args, unknown = parser.parse_known_args()
args_dict = vars(args)
dataset_dir = download_and_unzip(
dataset_parent_dir=os.path.join(
os.path.dirname(os.path.realpath(__file__)), args.data_path
),
)
if args.mode == "online":
prepare_data_for_online_inference(dataset_dir=dataset_dir)
else:
prepare_data_for_batch_inference(dataset_dir=dataset_dir)