cli/jobs/pipelines/automl/image-instance-segmentation-task-fridge-items-pipeline/prepare_data.py
def upload_data_and_create_jsonl_mltable_files(ml_client, dataset_parent_dir):
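    """Download the odFridgeObjectsMask dataset, register it as a URI folder
    data asset, convert the VOC masks to JSONL annotations, and write the
    MLTable folders used for the training and validation inputs.

    :param ml_client: MLClient used to create the data asset.
    :param dataset_parent_dir: Local directory the dataset is downloaded into.
    """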
    # Download data from a public URL.
    # Create the data folder if it doesn't exist.
    os.makedirs(dataset_parent_dir, exist_ok=True)

    # Public URL of the dataset archive
    download_url = "https://automlsamplenotebookdata-adcuc7f7bqhhh8a4.b02.azurefd.net/image-instance-segmentation/odFridgeObjectsMask.zip"

    # Extract the current dataset name from the dataset URL
    dataset_name = os.path.basename(download_url).split(".")[0]
    # Get the dataset path for later use
    dataset_dir = os.path.join(dataset_parent_dir, dataset_name)

    # Get the data zip file path
    data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")

    # Download the dataset
    urllib.request.urlretrieve(download_url, filename=data_file)

    # Extract the files
    with ZipFile(data_file, "r") as zip_ref:
        print("extracting files...")
        zip_ref.extractall(path=dataset_parent_dir)
        print("done")
    # Delete the zip file
    os.remove(data_file)

    # Upload the data and create a URI folder data asset
    print("Uploading data to blob storage")
    my_data = Data(
        path=dataset_dir,
        type=AssetTypes.URI_FOLDER,
        description="Fridge-items images instance segmentation",
        name="fridge-items-images-is-p",
    )

    uri_folder_data_asset = ml_client.data.create_or_update(my_data)

    print(uri_folder_data_asset)
    print("")
    print("Path to folder in Blob Storage:")
    print(uri_folder_data_asset.path)

    print("Installing scikit-image, numpy and simplification packages")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "scikit-image==0.25.0"]
    )
    # Install a numpy version compatible with scikit-image==0.25.0.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "simplification==0.7.12"]
    )
    print("done")

    print("Creating jsonl files")
    # Imported here because the converter depends on the packages installed above.
    from jsonl_converter import convert_mask_in_VOC_to_jsonl

    convert_mask_in_VOC_to_jsonl(dataset_dir, uri_folder_data_asset.path)
    print("done")

    # We'll copy each JSONL file within its related MLTable folder
    training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
    validation_mltable_path = os.path.join(
        dataset_parent_dir, "validation-mltable-folder"
    )

    # First, create the folders if they don't exist
    os.makedirs(training_mltable_path, exist_ok=True)
    os.makedirs(validation_mltable_path, exist_ok=True)

    # Paths to the training and validation annotation files
    train_annotations_file = os.path.join(
        training_mltable_path, "train_annotations.jsonl"
    )
    validation_annotations_file = os.path.join(
        validation_mltable_path, "validation_annotations.jsonl"
    )

    # Create and save the train MLTable
    train_mltable_file_contents = create_ml_table_file(
        os.path.basename(train_annotations_file)
    )
    save_ml_table_file(training_mltable_path, train_mltable_file_contents)

    # Create and save the validation MLTable
    validation_mltable_file_contents = create_ml_table_file(
        os.path.basename(validation_annotations_file)
    )
    save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)
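

# Minimal usage sketch (an assumption, not part of the original section): one way
# this helper could be driven from the command line. The argument names, the
# "./data" default, and the overall CLI shape below are illustrative only.
if __name__ == "__main__":
    import argparse

    from azure.ai.ml import MLClient
    from azure.identity import DefaultAzureCredential

    parser = argparse.ArgumentParser(description="Prepare fridge items dataset")
    parser.add_argument("--subscription", required=True, help="Azure subscription id")
    parser.add_argument("--group", required=True, help="Resource group name")
    parser.add_argument("--workspace", required=True, help="AzureML workspace name")
    parser.add_argument("--data_path", default="./data", help="Local dataset folder")
    args = parser.parse_args()

    # Authenticate against the workspace and run the preparation steps end to end.
    credential = DefaultAzureCredential()
    ml_client = MLClient(credential, args.subscription, args.group, args.workspace)
    upload_data_and_create_jsonl_mltable_files(
        ml_client=ml_client, dataset_parent_dir=args.data_path
    )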