in cli/foundation-models/system/finetune/image-classification/multilabel-classification/prepare_data.py [0:0]
def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir):
    """Convert ``labels.csv`` into train/validation JSONL and MLTable files.

    Reads ``<dataset_dir>/labels.csv``, skips its first (title) line and any
    blank lines, and turns every remaining ``<image name>,<labels>`` line
    into one JSON line of the form
    ``{"image_url": ..., "label": [...]}``. The labels field is split on
    single spaces, so each record carries a list of labels.

    Records are split between two files by the line's index in the CSV
    (title line counted as index 0): indices divisible by 5 go to the
    validation file, all others to the training file. Finally an MLTable
    file is generated for each folder via ``create_ml_table_file`` and
    ``save_ml_table_file``.

    Args:
        uri_folder_data_path: Prefix of every image URL. It is concatenated
            directly with ``"images/<image name>"``; no separator is
            inserted, so it needs to end with one already.
        dataset_dir: Directory containing ``labels.csv``. The
            ``training-mltable-folder`` and ``validation-mltable-folder``
            directories are created in ``os.path.dirname(dataset_dir)``.

    Returns:
        None. The results are the files written to disk.
    """
    print("Creating jsonl files")

    dataset_parent_dir = os.path.dirname(dataset_dir)

    # Each JSONL file lives inside its related MLTable folder.
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # Create MLTable folders, if they don't exist
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # One CSV line out of every `train_validation_ratio` goes to validation.
    train_validation_ratio = 5

    # Path to the training and validation files
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Path to the labels file.
    label_file = os.path.join(dataset_dir, "labels.csv")

    # Read each annotation and convert it to a jsonl line. The encoding is
    # pinned so the result does not depend on the platform's locale.
    with open(train_annotations_file, "w", encoding="utf-8") as train_f, open(
        validation_annotations_file, "w", encoding="utf-8"
    ) as validation_f, open(label_file, "r", encoding="utf-8") as labels:
        for i, line in enumerate(labels):
            stripped = line.strip()
            # Skipping the title line and any empty lines.
            if i == 0 or not stripped:
                continue

            line_split = stripped.split(",")
            if len(line_split) != 2:
                print("Skipping the invalid line: {}".format(line))
                continue

            json_line = {
                "image_url": uri_folder_data_path + f"images/{line_split[0]}",
                # Multiple labels for one image are space-separated.
                "label": line_split[1].strip().split(" "),
            }
            serialized = json.dumps(json_line) + "\n"

            # `i` is the raw line index in the CSV (title and skipped lines
            # included), so the split is deterministic for a given file.
            if i % train_validation_ratio == 0:
                # Validation annotation
                validation_f.write(serialized)
            else:
                # Train annotation
                train_f.write(serialized)
    print("done")

    # Create and save train mltable
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save validation mltable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)