in cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py [0:0]
def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir):
    """Write train/validation JSONL annotations and their MLTable files.

    ``dataset_dir`` is expected to contain one sub-directory per class, each
    holding the images of that class. Every image produces one JSON line of
    the form ``{"image_url": <uri>/<class>/<image>, "label": <class>}``.
    Counting images across all classes, every fifth one (indices 0, 5, 10,
    ...) goes to the validation file and the rest go to the training file.

    The outputs are written next to the dataset, in the folders
    ``training-mltable-folder`` and ``validation-mltable-folder`` under the
    parent directory of ``dataset_dir``.

    Args:
        uri_folder_data_path: URI prefix under which the class folders are
            reachable. A trailing "/" is appended when missing.
        dataset_dir: Local directory holding one sub-directory per class.
    """
    print("Creating jsonl files")

    # normpath drops a trailing separator; without it, dirname("data/set/")
    # returns "data/set" and the MLTable folders would be created inside the
    # dataset itself, then be scanned as image classes on the next run.
    dataset_parent_dir = os.path.dirname(os.path.normpath(dataset_dir))

    # Each JSONL file lives inside its related MLTable folder
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # Create MLTable folders, if they don't exist
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # One image out of this many is assigned to the validation set
    train_validation_ratio = 5

    # Path to the training and validation files
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Image URLs are built by plain concatenation, so make sure the folder
    # URI ends with exactly the separator the concatenation relies on.
    url_prefix = uri_folder_data_path
    if url_prefix and not url_prefix.endswith("/"):
        url_prefix += "/"

    # Running image counter across all classes; drives the train/validation split
    index = 0

    # Scan each sub-directory and generate a JSONL line per image, distributed
    # over the train and validation JSONL files
    with open(train_annotations_file, "w", encoding="utf-8") as train_f, open(
        validation_annotations_file, "w", encoding="utf-8"
    ) as validation_f:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # split reproducible across runs and platforms.
        for class_name in sorted(os.listdir(dataset_dir)):
            sub_dir = os.path.join(dataset_dir, class_name)
            if not os.path.isdir(sub_dir):
                continue

            # Scan each sub-directory
            print(f"Parsing {sub_dir}")
            for image in sorted(os.listdir(sub_dir)):
                json_line = {
                    "image_url": f"{url_prefix}{class_name}/{image}",
                    "label": class_name,
                }
                serialized = json.dumps(json_line) + "\n"

                if index % train_validation_ratio == 0:
                    # Validation annotation
                    validation_f.write(serialized)
                else:
                    # Train annotation
                    train_f.write(serialized)
                index += 1
    print("done")

    # Create and save train mltable (the helper receives only the JSONL file name)
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save validation mltable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)