def create_jsonl_and_mltable_files()

in cli/jobs/automl-standalone-jobs/cli-automl-image-classification-multilabel-task-fridge-items/prepare_data.py [0:0]


def create_jsonl_and_mltable_files(
    uri_folder_data_path, dataset_dir, *, train_validation_ratio=5
):
    """Convert ``labels.csv`` into train/validation JSONL files plus MLTable files.

    Reads ``<dataset_dir>/labels.csv``, turns every valid row into a JSON line
    of the form ``{"image_url": ..., "label": [...]}`` and writes it either to
    ``training-mltable-folder/train_annotations.jsonl`` or to
    ``validation-mltable-folder/validation_annotations.jsonl``. Both folders
    are created under ``os.path.dirname(dataset_dir)``. Afterwards an MLTable
    file is generated and saved in each folder via ``create_ml_table_file``
    and ``save_ml_table_file``.

    Args:
        uri_folder_data_path: Prefix of every ``image_url``. The text
            ``images/<first csv column>`` is appended to it verbatim, so the
            prefix has to end with a separator (e.g. ``/``) if one is needed.
        dataset_dir: Directory that contains ``labels.csv``.
        train_validation_ratio: Rows whose line index in ``labels.csv`` is a
            multiple of this value go to the validation file, all others go
            to the training file. Defaults to 5.

    Raises:
        ValueError: If ``train_validation_ratio`` is smaller than 1.
    """
    if train_validation_ratio < 1:
        raise ValueError(
            "train_validation_ratio must be >= 1, got {}".format(
                train_validation_ratio
            )
        )

    print("Creating jsonl files")

    dataset_parent_dir = os.path.dirname(dataset_dir)

    # Each JSONL file lives inside its related MLTable folder.
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # Create MLTable folders, if they don't exist.
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # Paths to the training and validation annotation files.
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Path to the labels file.
    label_file = os.path.join(dataset_dir, "labels.csv")

    # Read each annotation and convert it to a jsonl line. The output files
    # are opened before the labels file, so they exist (possibly empty) even
    # when labels.csv cannot be opened.
    with open(train_annotations_file, "w") as train_f, \
            open(validation_annotations_file, "w") as validation_f, \
            open(label_file, "r") as labels:
        for i, line in enumerate(labels):
            # Skipping the title line and any empty lines.
            if i == 0 or len(line.strip()) == 0:
                continue
            line_split = line.strip().split(",")
            if len(line_split) != 2:
                print("Skipping the invalid line: {}".format(line))
                continue

            json_line = {
                "image_url": uri_folder_data_path + f"images/{line_split[0]}",
                # The second column holds the labels separated by spaces.
                "label": line_split[1].strip().split(" "),
            }

            # The split is keyed on the raw line index (header, blank and
            # invalid lines included), not on the number of records written.
            if i % train_validation_ratio == 0:
                target_f = validation_f
            else:
                target_f = train_f
            target_f.write(json.dumps(json_line) + "\n")
    print("done")

    # Create and save train mltable.
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save validation mltable.
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)