training_kd.ipynb (571 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Fine-tune/Evaluate/Quantize Open Source SLM/LLM using torchtune on Azure ML\n",
"\n",
"## Knowledge Distillation\n",
"\n",
"[Note] Please use the `Python 3.10 - SDK v2 (azureml_py310_sdkv2)` conda environment.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load config file\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os, sys\n",
"from utils.aml_common import (\n",
" check_kernel, \n",
" get_or_create_environment_asset,\n",
" get_or_create_docker_environment_asset, \n",
" get_or_create_data_asset\n",
")\n",
"\n",
"check_kernel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import yaml\n",
"from utils.logger import logger\n",
"from datetime import datetime\n",
"\n",
"\n",
"def _mask_secret(value):\n",
"    \"\"\"Return a placeholder for a secret so it never reaches the logs or saved cell output.\"\"\"\n",
"    return \"********\" if value else \"<not set>\"\n",
"\n",
"\n",
"snapshot_date = datetime.now().strftime(\"%Y-%m-%d\")\n",
"\n",
"# Read the project configuration. `d` is reused by later cells (d[\"serve\"], d[\"train\"]).\n",
"# NOTE(review): yaml.safe_load would be enough if config.yml only holds plain scalars/maps - confirm.\n",
"with open(\"config.yml\") as f:\n",
"    d = yaml.load(f, Loader=yaml.FullLoader)\n",
"\n",
"# Workspace, data and model settings from the \"config\" section.\n",
"AZURE_SUBSCRIPTION_ID = d[\"config\"][\"AZURE_SUBSCRIPTION_ID\"]\n",
"AZURE_RESOURCE_GROUP = d[\"config\"][\"AZURE_RESOURCE_GROUP\"]\n",
"AZURE_WORKSPACE = d[\"config\"][\"AZURE_WORKSPACE\"]\n",
"AZURE_SFT_DATA_NAME = d[\"config\"][\"AZURE_SFT_DATA_NAME\"]\n",
"AZURE_DPO_DATA_NAME = d[\"config\"][\"AZURE_DPO_DATA_NAME\"]\n",
"SFT_DATA_DIR = d[\"config\"][\"SFT_DATA_DIR\"]\n",
"DPO_DATA_DIR = d[\"config\"][\"DPO_DATA_DIR\"]\n",
"CLOUD_DIR = d[\"config\"][\"CLOUD_DIR\"]\n",
"HF_MODEL_NAME_OR_PATH = d[\"config\"][\"HF_MODEL_NAME_OR_PATH\"]\n",
"HF_TOKEN = d[\"config\"][\"HF_TOKEN\"]\n",
"IS_DEBUG = d[\"config\"][\"IS_DEBUG\"]\n",
"USE_LOWPRIORITY_VM = d[\"config\"][\"USE_LOWPRIORITY_VM\"]\n",
"USE_BUILTIN_ENV = False\n",
"\n",
"# Environment and compute settings from the \"train\" section.\n",
"azure_env_name = d[\"train\"][\"azure_env_name\"]\n",
"azure_compute_cluster_name = d[\"train\"][\"azure_compute_cluster_name\"]\n",
"azure_compute_cluster_size = d[\"train\"][\"azure_compute_cluster_size\"]\n",
"\n",
"# Weights & Biases settings from the \"train\" section.\n",
"wandb_api_key = d[\"train\"][\"wandb_api_key\"]\n",
"wandb_project = d[\"train\"][\"wandb_project\"]\n",
"wandb_watch = d[\"train\"][\"wandb_watch\"]\n",
"\n",
"# Make sure the local working directories exist before anything is written to them.\n",
"os.makedirs(SFT_DATA_DIR, exist_ok=True)\n",
"os.makedirs(DPO_DATA_DIR, exist_ok=True)\n",
"os.makedirs(CLOUD_DIR, exist_ok=True)\n",
"\n",
"logger.info(\"===== 0. Azure ML Training Info =====\")\n",
"logger.info(f\"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}\")\n",
"logger.info(f\"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}\")\n",
"logger.info(f\"AZURE_WORKSPACE={AZURE_WORKSPACE}\")\n",
"logger.info(f\"AZURE_SFT_DATA_NAME={AZURE_SFT_DATA_NAME}\")\n",
"logger.info(f\"AZURE_DPO_DATA_NAME={AZURE_DPO_DATA_NAME}\")\n",
"logger.info(f\"SFT_DATA_DIR={SFT_DATA_DIR}\")\n",
"logger.info(f\"DPO_DATA_DIR={DPO_DATA_DIR}\")\n",
"logger.info(f\"CLOUD_DIR={CLOUD_DIR}\")\n",
"logger.info(f\"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}\")\n",
"# Credentials are masked: cell output is stored in the notebook file and is easily shared or committed.\n",
"logger.info(f\"HF_TOKEN={_mask_secret(HF_TOKEN)}\")\n",
"logger.info(f\"IS_DEBUG={IS_DEBUG}\")\n",
"logger.info(f\"USE_LOWPRIORITY_VM={USE_LOWPRIORITY_VM}\")\n",
"logger.info(f\"USE_BUILTIN_ENV={USE_BUILTIN_ENV}\")\n",
"\n",
"logger.info(f\"azure_env_name={azure_env_name}\")\n",
"logger.info(f\"azure_compute_cluster_name={azure_compute_cluster_name}\")\n",
"logger.info(f\"azure_compute_cluster_size={azure_compute_cluster_size}\")\n",
"logger.info(f\"wandb_api_key={_mask_secret(wandb_api_key)}\")\n",
"logger.info(f\"wandb_project={wandb_project}\")\n",
"logger.info(f\"wandb_watch={wandb_watch}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Explicit override of the cluster name loaded from config.yml.\n",
"# Set COMPUTE_CLUSTER_OVERRIDE to None (or \"\") to keep the configured value.\n",
"COMPUTE_CLUSTER_OVERRIDE = \"gpu-cluster-a100-test\"\n",
"\n",
"if COMPUTE_CLUSTER_OVERRIDE:\n",
"    # Log the change so the value logged by the config cell is not mistaken for the one in use.\n",
"    logger.info(\n",
"        f\"Overriding azure_compute_cluster_name: {azure_compute_cluster_name} -> {COMPUTE_CLUSTER_OVERRIDE}\"\n",
"    )\n",
"    azure_compute_cluster_name = COMPUTE_CLUSTER_OVERRIDE"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"\n",
"## 1. Dataset preparation\n",
"\n",
"---\n",
"\n",
"For this hands-on, we use a Hugging Face dataset. If you would like to build or augment your own dataset, please refer to https://github.com/Azure/synthetic-qa-generation\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from random import randrange\n",
"\n",
"DEBUG_SAMPLE_SIZE = 800  # upper bound on the number of rows kept when IS_DEBUG is enabled\n",
"SPLIT_SEED = 42  # fixed seed so the train/eval split is the same on every run\n",
"\n",
"logger.info(\"===== 1. Dataset preparation =====\")\n",
"logger.info(\"Loading dataset. It may take several minutes to load the dataset.\")\n",
"\n",
"# Load dataset from the hub\n",
"\n",
"# data_path = \"BCCard/BCCard-Finance-Kor-QnA\"\n",
"# dataset = load_dataset(data_path, split=\"train\")\n",
"\n",
"dataset = load_dataset(\n",
"    \"HuggingFaceH4/helpful_instructions\", name=\"self_instruct\", split=\"train[:10%]\"\n",
")\n",
"# Rename the columns to `instruction` / `output`.\n",
"dataset = dataset.rename_column(\"prompt\", \"instruction\")\n",
"dataset = dataset.rename_column(\"completion\", \"output\")\n",
"\n",
"print(f\"Loaded Dataset size: {len(dataset)}\")\n",
"\n",
"if IS_DEBUG:\n",
"    # Never request more rows than the dataset holds, and report the number actually kept.\n",
"    num_debug_samples = min(DEBUG_SAMPLE_SIZE, len(dataset))\n",
"    logger.info(\n",
"        f\"Activated Debug mode. The number of sample was resampled to {num_debug_samples}.\"\n",
"    )\n",
"    dataset = dataset.select(range(num_debug_samples))\n",
"    print(f\"Debug Dataset size: {len(dataset)}\")\n",
"\n",
"logger.info(f\"Save dataset to {SFT_DATA_DIR}\")\n",
"# 80/20 train/eval split, written as JSON Lines files into SFT_DATA_DIR.\n",
"dataset_splits = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)\n",
"train_dataset = dataset_splits[\"train\"]\n",
"train_dataset.to_json(f\"{SFT_DATA_DIR}/train.jsonl\", force_ascii=False)\n",
"test_dataset = dataset_splits[\"test\"]\n",
"test_dataset.to_json(f\"{SFT_DATA_DIR}/eval.jsonl\", force_ascii=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"\n",
"## 2. Training preparation\n",
"\n",
"---\n",
"\n",
"### 2.1. Configure workspace details\n",
"\n",
"To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import time\n",
"from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n",
"from azure.ai.ml import MLClient, Input\n",
"\n",
"logger.info(\"===== 2. Training preparation =====\")\n",
"logger.info(\"Calling DefaultAzureCredential.\")\n",
"\n",
"# Default Azure authentication; switch to the commented line for an interactive browser login.\n",
"credential = DefaultAzureCredential()\n",
"# credential = InteractiveBrowserCredential()\n",
"\n",
"# Workspace handle built from the subscription / resource group / workspace loaded from config.yml.\n",
"ml_client = MLClient(\n",
"    credential=credential,\n",
"    subscription_id=AZURE_SUBSCRIPTION_ID,\n",
"    resource_group_name=AZURE_RESOURCE_GROUP,\n",
"    workspace_name=AZURE_WORKSPACE,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"### 2.2. Create AzureML environment and data\n",
"\n",
"Azure ML defines containers (called environment assets) in which your code will run. We can use a built-in environment or build a custom environment (Docker container, conda).\n",
"This hands-on builds a custom environment from a Docker build context.\n",
"\n",
"Training data can be read from the local development environment, or it can be registered as an Azure ML data asset.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# NOTE(review): update=False presumably reuses an already registered asset - confirm in utils/aml_common.py.\n",
"\n",
"# Environment asset for the training job; the Docker directory is CLOUD_DIR.\n",
"env = get_or_create_docker_environment_asset(\n",
"    ml_client,\n",
"    azure_env_name,\n",
"    docker_dir=CLOUD_DIR,\n",
"    update=False,\n",
")\n",
"\n",
"# Data asset for the SFT dataset; the local source directory is SFT_DATA_DIR.\n",
"data = get_or_create_data_asset(\n",
"    ml_client,\n",
"    AZURE_SFT_DATA_NAME,\n",
"    data_local_dir=SFT_DATA_DIR,\n",
"    update=False,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3. Training script\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pygmentize scripts/launcher_distributed_kd.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"\n",
"## 3. Training\n",
"\n",
"---\n",
"\n",
"### 3.1. Create the compute cluster\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from azure.ai.ml.entities import AmlCompute\n",
"\n",
"logger.info(\"===== 3. Training =====\")\n",
"### Create the compute cluster\n",
"try:\n",
"    compute = ml_client.compute.get(azure_compute_cluster_name)\n",
"    logger.info(\"The compute cluster already exists! Reusing it for the current run\")\n",
"except Exception as ex:\n",
"    # Any lookup failure is treated as \"cluster not found\"; the reason is logged so that\n",
"    # other causes (for example an authentication problem) are not hidden.\n",
"    logger.info(f\"Compute lookup failed: {ex}\")\n",
"    logger.info(\n",
"        f\"Looks like the compute cluster doesn't exist. Creating a new one with compute size {azure_compute_cluster_size}!\"\n",
"    )\n",
"    tier = \"LowPriority\" if USE_LOWPRIORITY_VM else \"Dedicated\"\n",
"    try:\n",
"        logger.info(f\"Attempt #1 - Trying to create a {tier} compute\")\n",
"        compute = AmlCompute(\n",
"            name=azure_compute_cluster_name,\n",
"            size=azure_compute_cluster_size,\n",
"            tier=tier,\n",
"            max_instances=1,  # For multi node training set this to an integer value more than 1\n",
"        )\n",
"        ml_client.compute.begin_create_or_update(compute).wait()\n",
"    except Exception as e:\n",
"        # Without a cluster the training job below cannot run, so report the cause and stop here\n",
"        # instead of continuing with a misleading failure later.\n",
"        logger.error(\n",
"            f\"Failed to create compute cluster '{azure_compute_cluster_name}': {e}\"\n",
"        )\n",
"        raise"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.2. Start training job\n",
"\n",
"The `command` function lets you configure the following key aspects.\n",
"\n",
"- `inputs` - This is the dictionary of inputs using name value pairs to the command.\n",
"  - `type` - The type of input. This can be a `uri_file` or `uri_folder`. The default is `uri_folder`.\n",
"  - `path` - The path to the file or folder. These can be local or remote files or folders. For remote files - http/https, wasb are supported.\n",
"    - Azure ML `data`/`dataset` or `datastore` are of type `uri_folder`. To use `data`/`dataset` as input, you can use a registered dataset in the workspace using the format `<data_name>:<version>`. For example, `Input(type='uri_folder', path='my_dataset:1')`.\n",
"  - `mode` - Mode of how the data should be delivered to the compute target. Allowed values are `ro_mount`, `rw_mount` and `download`. Default is `ro_mount`.\n",
"- `code` - This is the path where the code to run the command is located.\n",
"- `compute` - The compute on which the command will run. You can run it on the local machine by using `local` for the compute.\n",
"- `command` - This is the command that needs to be run. Inputs are referenced\n",
"  in the `command` using the `${{inputs.<input_name>}}` expression. To use files or folders as inputs, we can use the `Input` class. The `Input` class supports three parameters: `type`, `path` and `mode`, described under `inputs` above.\n",
"- `environment` - This is the environment needed for the command to run. Curated (built-in) or custom environments from the workspace can be used.\n",
"- `instance_count` - Number of nodes. Default is 1.\n",
"- `distribution` - Distribution configuration for distributed training scenarios. Azure Machine Learning supports PyTorch, TensorFlow, and MPI-based distributed training.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from azure.ai.ml import command\n",
"from azure.ai.ml import Input, Output\n",
"from azure.ai.ml.entities import ResourceConfiguration\n",
"from utils.aml_common import get_num_gpus\n",
"from azure.ai.ml.entities import Model\n",
"\n",
"num_gpu = get_num_gpus(azure_compute_cluster_size)\n",
"logger.info(f\"Number of GPUs={num_gpu}\")\n",
"\n",
"# Choose the environment. With the built-in image the requirements are installed when the job starts.\n",
"str_command = \"\"\n",
"if USE_BUILTIN_ENV:\n",
"    str_env = \"azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/versions/19\"  # Use built-in Environment asset\n",
"    str_command += \"pip install -r requirements.txt && \"\n",
"else:\n",
"    str_env = f\"{azure_env_name}@latest\"  # Use the custom Environment asset named in config.yml\n",
"\n",
"# Choose the recipe and the launcher script from the number of GPUs on the cluster VM size.\n",
"if num_gpu > 1:\n",
"    tune_recipe = \"knowledge_distillation_distributed\"\n",
"    str_command += \"python launcher_distributed_kd.py \"\n",
"else:\n",
"    tune_recipe = \"knowledge_distillation_single_device\"\n",
"    str_command += \"python launcher_single_kd.py \"\n",
"\n",
"# Job inputs. Each key is forwarded to the launcher as `--<key> ${{inputs.<key>}}`, so the command\n",
"# line is generated from this dict (in insertion order) instead of being maintained by hand.\n",
"# NOTE(review): hf_token and wandb_api_key are passed as plain job inputs and are presumably visible\n",
"# in the job definition - consider environment variables or a secret store; confirm with the launcher.\n",
"inputs_dict = {}\n",
"\n",
"# Weights & Biases is optional: the key is None or \"\" when it is left empty in config.yml.\n",
"if wandb_api_key:\n",
"    inputs_dict.update(\n",
"        wandb_api_key=wandb_api_key,\n",
"        wandb_project=wandb_project,\n",
"        wandb_watch=wandb_watch,\n",
"    )\n",
"\n",
"inputs_dict.update(\n",
"    # train_dir=Input(type=\"uri_folder\", path=SFT_DATA_DIR), # Get data from local path\n",
"    train_dir=Input(path=f\"{AZURE_SFT_DATA_NAME}@latest\"),  # Get data from Data asset\n",
"    hf_token=HF_TOKEN,\n",
"    tune_recipe=tune_recipe,\n",
"    tune_action=\"fine-tune\",\n",
"    teacher_model_id=\"Qwen/Qwen2-1.5B-Instruct\",\n",
"    student_model_id=\"Qwen/Qwen2-0.5B-Instruct\",\n",
"    teacher_model_dir=\"./teacher_model\",\n",
"    student_model_dir=\"./student_model\",\n",
"    log_dir=\"./outputs/log\",\n",
"    model_output_dir=\"./outputs\",\n",
"    tune_finetune_yaml=\"knowledge_distillation_qwen2.yaml\",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE\n",
"    tune_eval_yaml=\"evaluation_qwen2.yaml\",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE\n",
"    tune_quant_yaml=\"quant_qwen2.yaml\",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE\n",
")\n",
"\n",
"# Plain (non f-) strings keep the `${{inputs.<name>}}` placeholders literal for Azure ML to resolve.\n",
"str_command += \" \".join(\n",
"    \"--\" + name + \" ${{inputs.\" + name + \"}}\" for name in inputs_dict\n",
")\n",
"\n",
"logger.info(f\"Tune recipe: {tune_recipe}\")\n",
"\n",
"job = command(\n",
"    inputs=inputs_dict,\n",
"    code=\"./scripts\",  # local path where the code is stored\n",
"    compute=azure_compute_cluster_name,\n",
"    command=str_command,\n",
"    environment=str_env,\n",
"    instance_count=1,\n",
"    distribution={\n",
"        \"type\": \"PyTorch\",\n",
"        \"process_count_per_instance\": num_gpu,  # For multi-gpu training set this to an integer value more than 1\n",
"    },\n",
")\n",
"\n",
"# Submit the job, then stream its logs into the notebook.\n",
"returned_job = ml_client.jobs.create_or_update(job)\n",
"logger.info(\n",
"    \"\"\"Started training job. Now a dedicated Compute Cluster for training is provisioned and the environment\n",
"required for training is automatically set up from Environment.\n",
"\n",
"If you have set up a new custom Environment, it will take approximately 20 minutes or more to set up the Environment before provisioning the training cluster.\n",
"\"\"\"\n",
")\n",
"ml_client.jobs.stream(returned_job.name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"display(returned_job)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Keep the name of the submitted job; it is passed to the model asset registration below.\n",
"job_name = returned_job.name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%store job_name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"\n",
"## 4. (Optional) Create model asset and get fine-tuned LLM to local folder\n",
"\n",
"---\n",
"\n",
"### 4.1. Create model asset\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from utils.aml_common import get_or_create_model_asset\n",
"\n",
"# Log the section banner and parameters first, as the other sections do, so the log still\n",
"# shows what was attempted if the registration call fails.\n",
"logger.info(\n",
"    \"===== 4. (Optional) Create model asset and get fine-tuned LLM to local folder =====\"\n",
")\n",
"\n",
"azure_model_name = d[\"serve\"][\"azure_model_name\"]\n",
"model_dir = d[\"train\"][\"model_dir\"]\n",
"logger.info(f\"azure_model_name={azure_model_name}\")\n",
"logger.info(f\"model_dir={model_dir}\")\n",
"\n",
"# `job_name` identifies the training job submitted earlier in this notebook.\n",
"model = get_or_create_model_asset(\n",
"    ml_client,\n",
"    azure_model_name,\n",
"    job_name,\n",
"    model_dir,\n",
"    model_type=\"custom_model\",\n",
"    download_quantized_model_only=False,\n",
"    update=False,\n",
")\n",
"logger.info(f\"model={model}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4.2. Get fine-tuned LLM to local folder\n",
"\n",
"You can copy the model to your local directory to perform inference, or serve it in the Azure environment (e.g., a real-time endpoint).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional step: copy the registered model version into a local folder.\n",
"DOWNLOAD_TO_LOCAL = False\n",
"local_model_dir = \"./artifact_downloads_kd\"\n",
"\n",
"if DOWNLOAD_TO_LOCAL:\n",
"    os.makedirs(local_model_dir, exist_ok=True)\n",
"    download_kwargs = dict(\n",
"        name=azure_model_name,\n",
"        version=model.version,\n",
"        download_path=local_model_dir,\n",
"    )\n",
"    ml_client.models.download(**download_kwargs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"# Remove the local working folders created by this notebook.\n",
"# Pure Python instead of `!rm -rf` with interpolated variables: works on any OS and\n",
"# cannot expand an empty or unexpected value into a destructive shell command.\n",
"for _path in (SFT_DATA_DIR, DPO_DATA_DIR, local_model_dir):\n",
"    if _path:  # skip empty / None paths\n",
"        shutil.rmtree(_path, ignore_errors=True)  # missing folders are ignored, like `rm -f`"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py312-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}