# upload_data_and_create_jsonl_mltable_files()
#
# From: cli/jobs/pipelines/automl/image-instance-segmentation-task-fridge-items-pipeline/prepare_data.py



def upload_data_and_create_jsonl_mltable_files(ml_client, dataset_parent_dir):
    """Download the fridge-items mask dataset, register it as a data asset,
    and build the train/validation JSONL + MLTable folders.

    Steps:
      1. Download and extract the odFridgeObjectsMask zip into
         ``dataset_parent_dir``.
      2. Upload the extracted folder as an Azure ML URI_FOLDER data asset.
      3. Install conversion dependencies (scikit-image, numpy, simplification)
         into the current interpreter via pip.
      4. Convert VOC masks to JSONL annotations and write MLTable files into
         ``training-mltable-folder`` and ``validation-mltable-folder``.

    :param ml_client: Authenticated azure.ai.ml MLClient used to create the
        data asset.
    :param dataset_parent_dir: Local directory under which the dataset and
        MLTable folders are created.
    """
    # create data folder if it doesn't exist.
    os.makedirs(dataset_parent_dir, exist_ok=True)

    # download data
    download_url = "https://automlsamplenotebookdata-adcuc7f7bqhhh8a4.b02.azurefd.net/image-instance-segmentation/odFridgeObjectsMask.zip"

    # Extract current dataset name from dataset url
    dataset_name = os.path.basename(download_url).split(".")[0]
    # Get dataset path for later use
    dataset_dir = os.path.join(dataset_parent_dir, dataset_name)

    # Get the data zip file path
    data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")

    # Download the dataset
    urllib.request.urlretrieve(download_url, filename=data_file)

    # Extract files; `zf` (not `zip`) avoids shadowing the builtin.
    with ZipFile(data_file, "r") as zf:
        print("extracting files...")
        zf.extractall(path=dataset_parent_dir)
        print("done")
    # delete zip file
    os.remove(data_file)

    # Upload data and create a data asset URI folder
    print("Uploading data to blob storage")
    my_data = Data(
        path=dataset_dir,
        type=AssetTypes.URI_FOLDER,
        description="Fridge-items images instance segmentation",
        name="fridge-items-images-is-p",
    )

    uri_folder_data_asset = ml_client.data.create_or_update(my_data)

    print(uri_folder_data_asset)
    print("")
    print("Path to folder in Blob Storage:")
    print(uri_folder_data_asset.path)

    print("Installing scikit-image and simplification package")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "scikit-image==0.25.0"]
    )
    # Install numpy version compatible with scikit-image==0.25.0.
    # NOTE: installed after scikit-image on purpose so this pin wins.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "simplification==0.7.12"]
    )
    print("done")

    print("Creating jsonl files")
    # Imported lazily: jsonl_converter needs the packages installed just above.
    from jsonl_converter import convert_mask_in_VOC_to_jsonl

    convert_mask_in_VOC_to_jsonl(dataset_dir, uri_folder_data_asset.path)
    print("done")

    # We'll copy each JSONL file within its related MLTable folder
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # First, let's create the folders if they don't exist
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # Path to the training and validation files
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Create and save train mltable
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save validation mltable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)