in cli/jobs/pipelines/automl/image-object-detection-task-fridge-items-pipeline/prepare_data.py [0:0]
import json
import os
import xml.etree.ElementTree as ET

def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir):
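    """Convert the XML annotations (Pascal VOC style) under ``dataset_dir`` into
    JSONL files and wrap them in MLTable folders for AutoML image object detection.

    Every fifth image is written to the validation JSONL; the rest go to the
    training JSONL. ``uri_folder_data_path`` is the datastore URI folder that
    holds the images and is used as the prefix of each ``image_url``.
    """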
print("Creating jsonl files")
dataset_parent_dir = os.path.dirname(dataset_dir)
# We'll copy each JSONL file within its related MLTable folder
training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
validation_mltable_path = os.path.join(
dataset_parent_dir, "validation-mltable-folder"
)
# Create MLTable folders, if they don't exist
os.makedirs(training_mltable_path, exist_ok=True)
os.makedirs(validation_mltable_path, exist_ok=True)
train_validation_ratio = 5
# Path to the training and validation files
train_annotations_file = os.path.join(
training_mltable_path, "train_annotations.jsonl"
)
validation_annotations_file = os.path.join(
validation_mltable_path, "validation_annotations.jsonl"
)
# Baseline of json line dictionary
json_line_sample = {
"image_url": uri_folder_data_path,
"image_details": {"format": None, "width": None, "height": None},
"label": [],
}
# Path to the annotations
annotations_folder = os.path.join(dataset_dir, "annotations")
# Read each annotation and convert it to jsonl line
    with open(train_annotations_file, "w") as train_f:
        with open(validation_annotations_file, "w") as validation_f:
            for i, filename in enumerate(os.listdir(annotations_folder)):
                if not filename.endswith(".xml"):
                    print("Skipping unknown file: {}".format(filename))
                    continue

                annotations_file_path = os.path.join(annotations_folder, filename)
                print(f"Parsing {annotations_file_path}")

                root = ET.parse(annotations_file_path).getroot()
                width = int(root.find("size/width").text)
                height = int(root.find("size/height").text)

                # Collect one label entry per bounding box, with coordinates
                # normalized to the [0, 1] range
                labels = []
                for obj in root.findall("object"):
                    name = obj.find("name").text
                    xmin = obj.find("bndbox/xmin").text
                    ymin = obj.find("bndbox/ymin").text
                    xmax = obj.find("bndbox/xmax").text
                    ymax = obj.find("bndbox/ymax").text
                    isCrowd = int(obj.find("difficult").text)
                    labels.append(
                        {
                            "label": name,
                            "topX": float(xmin) / width,
                            "topY": float(ymin) / height,
                            "bottomX": float(xmax) / width,
                            "bottomY": float(ymax) / height,
                            "isCrowd": isCrowd,
                        }
                    )

                # Build the jsonl entry from the baseline sample; copy the nested
                # image_details dict as well so the shared sample is not mutated
                image_filename = root.find("filename").text
                _, file_extension = os.path.splitext(image_filename)
                json_line = dict(json_line_sample)
                json_line["image_details"] = dict(json_line_sample["image_details"])
                json_line["image_url"] = (
                    json_line["image_url"] + "images/" + image_filename
                )
                json_line["image_details"]["format"] = file_extension[1:]
                json_line["image_details"]["width"] = width
                json_line["image_details"]["height"] = height
                json_line["label"] = labels

                if i % train_validation_ratio == 0:
                    # validation annotation
                    validation_f.write(json.dumps(json_line) + "\n")
                else:
                    # train annotation
                    train_f.write(json.dumps(json_line) + "\n")
    print("done")
    # Create and save the train MLTable
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save the validation MLTable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)
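

# Hypothetical usage sketch (not part of the original file): the pipeline's
# data-preparation step would invoke the function roughly like this, assuming
# the fridge-items archive has already been extracted to ``dataset_dir`` and
# the images have been uploaded to the datastore folder ``uri_folder_data_path``.
# The trailing "/" on the URI matters because "images/<filename>" is appended
# by plain string concatenation when building each image_url.
if __name__ == "__main__":
    example_uri_folder = (
        "azureml://datastores/workspaceblobstore/paths/fridge-items/"  # hypothetical path
    )
    example_dataset_dir = "./data/odFridgeObjects"  # hypothetical local extract location
    create_jsonl_and_mltable_files(
        uri_folder_data_path=example_uri_folder, dataset_dir=example_dataset_dir
    )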