def create_jsonl_and_mltable_files()

in cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py [0:0]


def create_jsonl_and_mltable_files(
    uri_folder_data_path, dataset_dir, *, train_validation_ratio=5
):
    """Write train/validation JSONL annotation files and their MLTable files.

    Each immediate sub-directory of ``dataset_dir`` is treated as one class, and
    every entry inside it produces one JSON line of the form
    ``{"image_url": <base uri>/<class>/<entry>, "label": <class>}``. Lines are
    distributed between ``training-mltable-folder/train_annotations.jsonl`` and
    ``validation-mltable-folder/validation_annotations.jsonl``; both folders are
    created next to ``dataset_dir`` (in its parent directory).

    Args:
        uri_folder_data_path: Base URI prepended to ``<class>/<entry>`` to form
            each ``image_url``. A trailing "/" is appended when a non-empty
            value does not already end with one.
        dataset_dir: Local directory containing one sub-directory per class.
            A trailing path separator is tolerated.
        train_validation_ratio: One entry out of every ``train_validation_ratio``
            (starting with the first one) is written to the validation file;
            the rest go to the training file. Must be >= 1.

    Raises:
        ValueError: If ``train_validation_ratio`` is smaller than 1.
    """
    if train_validation_ratio < 1:
        raise ValueError(
            f"train_validation_ratio must be >= 1, got {train_validation_ratio}"
        )

    print("Creating jsonl files")

    # os.path.dirname("data/images/") returns "data/images", which would place
    # the MLTable folders inside the dataset directory and make the scan below
    # pick them up as classes. Strip trailing separators first (keeping a bare
    # root such as "/" untouched).
    path_separators = os.sep + (os.altsep or "")
    dataset_parent_dir = os.path.dirname(
        dataset_dir.rstrip(path_separators) or dataset_dir
    )

    # We will copy each JSONL file within its related MLTable folder
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # Create MLTable folders, if they don't exist
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # Path to the training and validation files
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Image URLs are built by plain concatenation, so make sure the base URI
    # ends with exactly the separator the "<class>/<entry>" suffix expects.
    url_prefix = uri_folder_data_path
    if url_prefix and not url_prefix.endswith("/"):
        url_prefix += "/"

    index = 0
    # Scan each sub directory and generate a jsonl line per entry, distributed
    # on train and valid JSONL files
    with open(train_annotations_file, "w", encoding="utf-8") as train_f, open(
        validation_annotations_file, "w", encoding="utf-8"
    ) as validation_f:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # train/validation split reproducible across runs and platforms.
        for class_name in sorted(os.listdir(dataset_dir)):
            sub_dir = os.path.join(dataset_dir, class_name)
            if not os.path.isdir(sub_dir):
                continue

            print(f"Parsing {sub_dir}")
            for image in sorted(os.listdir(sub_dir)):
                json_line = {
                    "image_url": url_prefix + f"{class_name}/{image}",
                    "label": class_name,
                }

                # Every train_validation_ratio-th entry goes to validation
                if index % train_validation_ratio == 0:
                    target_f = validation_f
                else:
                    target_f = train_f
                target_f.write(json.dumps(json_line) + "\n")
                index += 1
    print("done")

    # Create and save train mltable.
    # NOTE(review): create_ml_table_file / save_ml_table_file are defined
    # elsewhere; presumably they build the MLTable definition referencing the
    # given JSONL file name and write it into the folder -- confirm there.
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save validation mltable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)