# cli/jobs/pipelines/automl/image-object-detection-task-fridge-items-pipeline/prepare_data.py
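"""Prepare the odFridgeObjects dataset for the AutoML image object detection pipeline sample.

Downloads the sample dataset, uploads it to the workspace as a uri_folder data
asset, converts the Pascal VOC-style XML annotations to JSONL, and writes
MLTable definitions for the training and validation splits.

Example invocation (argument values are placeholders):
    python prepare_data.py --subscription <subscription-id> --group <resource-group> --workspace <workspace-name>
"""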
import argparse
import json
import os
import urllib.request
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from azure.identity import InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


def create_ml_table_file(filename):
    """Return the MLTable definition that reads the given JSONL annotations file."""
return (
"paths:\n"
" - file: ./{0}\n"
"transformations:\n"
" - read_json_lines:\n"
" encoding: utf8\n"
" invalid_lines: error\n"
" include_path_column: false\n"
" - convert_column_types:\n"
" - columns: image_url\n"
" column_type: stream_info"
).format(filename)


def save_ml_table_file(output_path, mltable_file_contents):
    """Write the MLTable definition to a file named MLTable in output_path."""
with open(os.path.join(output_path, "MLTable"), "w") as f:
f.write(mltable_file_contents)


def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir):
    """Convert the XML annotations to JSONL files and wrap them in MLTable folders."""
    print("Creating jsonl files")
    dataset_parent_dir = os.path.dirname(dataset_dir)
    # Each JSONL file lives inside its own MLTable folder
training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
validation_mltable_path = os.path.join(
dataset_parent_dir, "validation-mltable-folder"
)
# Create MLTable folders, if they don't exist
os.makedirs(training_mltable_path, exist_ok=True)
os.makedirs(validation_mltable_path, exist_ok=True)
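    # Every fifth sample goes to the validation set, i.e. an 80/20 train/validation split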
train_validation_ratio = 5
    # Paths to the training and validation annotation files
train_annotations_file = os.path.join(
training_mltable_path, "train_annotations.jsonl"
)
validation_annotations_file = os.path.join(
validation_mltable_path, "validation_annotations.jsonl"
)
    # Template for each JSON line, using the schema AutoML expects for image
    # object detection: image_url, image_details, and a list of labels
json_line_sample = {
"image_url": uri_folder_data_path,
"image_details": {"format": None, "width": None, "height": None},
"label": [],
}
# Path to the annotations
annotations_folder = os.path.join(dataset_dir, "annotations")
# Read each annotation and convert it to jsonl line
with open(train_annotations_file, "w") as train_f:
with open(validation_annotations_file, "w") as validation_f:
for i, filename in enumerate(os.listdir(annotations_folder)):
            if not filename.endswith(".xml"):
                print(f"Skipping unknown file: {filename}")
                continue
            annotations_file_path = os.path.join(annotations_folder, filename)
            print(f"Parsing {annotations_file_path}")
root = ET.parse(annotations_file_path).getroot()
width = int(root.find("size/width").text)
height = int(root.find("size/height").text)
labels = []
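                # Bounding-box pixel coordinates are normalized below to
                # [0, 1] fractions of the image width and height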
                for obj in root.findall("object"):
                    name = obj.find("name").text
                    xmin = obj.find("bndbox/xmin").text
                    ymin = obj.find("bndbox/ymin").text
                    xmax = obj.find("bndbox/xmax").text
                    ymax = obj.find("bndbox/ymax").text
                    # Map the VOC "difficult" flag onto the isCrowd field
                    is_crowd = int(obj.find("difficult").text)
labels.append(
{
"label": name,
"topX": float(xmin) / width,
"topY": float(ymin) / height,
"bottomX": float(xmax) / width,
"bottomY": float(ymax) / height,
"isCrowd": isCrowd,
}
)
                # Build the JSONL entry for this image
                image_filename = root.find("filename").text
                _, file_extension = os.path.splitext(image_filename)
                # dict() makes only a shallow copy, so assign a fresh
                # image_details dict instead of mutating the shared template
                json_line = dict(json_line_sample)
                json_line["image_url"] = (
                    json_line["image_url"] + "images/" + image_filename
                )
                json_line["image_details"] = {
                    "format": file_extension[1:],
                    "width": width,
                    "height": height,
                }
json_line["label"] = labels
if i % train_validation_ratio == 0:
# validation annotation
validation_f.write(json.dumps(json_line) + "\n")
else:
# train annotation
train_f.write(json.dumps(json_line) + "\n")
print("done")
# Create and save train mltable
train_mltable_file_contents = create_ml_table_file(
os.path.basename(train_annotations_file)
)
save_ml_table_file(training_mltable_path, train_mltable_file_contents)
# Create and save validation mltable
validation_mltable_file_contents = create_ml_table_file(
os.path.basename(validation_annotations_file)
)
save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)


def upload_data_and_create_jsonl_mltable_files(ml_client, dataset_parent_dir):
    """Download the dataset, upload it as a data asset, and build the JSONL/MLTable files."""
    # Create the data folder if it doesn't exist
    os.makedirs(dataset_parent_dir, exist_ok=True)
    # Download the dataset from the public URL
download_url = "https://automlsamplenotebookdata-adcuc7f7bqhhh8a4.b02.azurefd.net/image-object-detection/odFridgeObjects.zip"
    # Derive the dataset name ("odFridgeObjects") from the download URL
    dataset_name = os.path.basename(download_url).split(".")[0]
# Get dataset path for later use
dataset_dir = os.path.join(dataset_parent_dir, dataset_name)
# Get the data zip file path
data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")
# Download the dataset
urllib.request.urlretrieve(download_url, filename=data_file)
# extract files
    with ZipFile(data_file, "r") as zip_file:
        print("extracting files...")
        zip_file.extractall(path=dataset_parent_dir)
        print("done")
# delete zip file
os.remove(data_file)
# Upload data and create a data asset URI folder
print("Uploading data to blob storage")
my_data = Data(
path=dataset_dir,
type=AssetTypes.URI_FOLDER,
description="Fridge-items images Object detection",
name="fridge-items-images-od-p",
)
uri_folder_data_asset = ml_client.data.create_or_update(my_data)
print(uri_folder_data_asset)
print("")
print("Path to folder in Blob Storage:")
print(uri_folder_data_asset.path)
create_jsonl_and_mltable_files(
uri_folder_data_path=uri_folder_data_asset.path, dataset_dir=dataset_dir
)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Prepare data for image object detection"
    )
parser.add_argument("--subscription", type=str, help="Subscription ID")
parser.add_argument("--group", type=str, help="Resource group name")
parser.add_argument("--workspace", type=str, help="Workspace name")
parser.add_argument(
"--data_path", type=str, default="./data", help="Dataset location"
)
    args, unknown = parser.parse_known_args()

    credential = InteractiveBrowserCredential()
    ml_client = None
    try:
        # Prefer workspace details from a local config.json, if present
        ml_client = MLClient.from_config(credential)
    except Exception:
        # No config.json found; fall back to workspace details from the CLI args
subscription_id = args.subscription
resource_group = args.group
workspace = args.workspace
ml_client = MLClient(credential, subscription_id, resource_group, workspace)
upload_data_and_create_jsonl_mltable_files(
ml_client=ml_client, dataset_parent_dir=args.data_path
)