# Fine-tune/Evaluate/Quantize Open Source SLM/LLM using torchtune on Azure ML

## Supervised Fine-tuning (SFT)

[Note] Please use `Python 3.10 - SDK v2 (azureml_py310_sdkv2)` conda environment.


## Load config file

---


In [None]:
%load_ext autoreload
%autoreload 2
import os, sys
from utils.aml_common import (
    check_kernel, 
    get_or_create_environment_asset,
    get_or_create_docker_environment_asset, 
    get_or_create_data_asset
)

# Run the kernel sanity check from utils.aml_common before anything else.
# NOTE(review): presumably verifies that the notebook runs on the conda
# environment named in the note above — confirm in utils/aml_common.py.
check_kernel()

In [None]:
import yaml
from utils.logger import logger
from datetime import datetime

snapshot_date = datetime.now().strftime("%Y-%m-%d")


def _mask_secret(value):
    """Return a log-safe placeholder for a secret.

    Only reveals whether the secret is set, never any part of its value, so
    that tokens and API keys do not end up in notebook output or log files.
    """
    return "***redacted***" if value else "<not set>"


# config.yml is a local file maintained by the notebook user.
# NOTE(review): yaml.load with FullLoader is kept as-is; if the file only
# contains plain scalars/mappings, yaml.safe_load is the safer choice.
with open("config.yml") as f:
    d = yaml.load(f, Loader=yaml.FullLoader)

# --- "config" section: workspace, data and model settings -------------------
AZURE_SUBSCRIPTION_ID = d["config"]["AZURE_SUBSCRIPTION_ID"]
AZURE_RESOURCE_GROUP = d["config"]["AZURE_RESOURCE_GROUP"]
AZURE_WORKSPACE = d["config"]["AZURE_WORKSPACE"]
AZURE_SFT_DATA_NAME = d["config"]["AZURE_SFT_DATA_NAME"]
AZURE_DPO_DATA_NAME = d["config"]["AZURE_DPO_DATA_NAME"]
SFT_DATA_DIR = d["config"]["SFT_DATA_DIR"]
DPO_DATA_DIR = d["config"]["DPO_DATA_DIR"]
CLOUD_DIR = d["config"]["CLOUD_DIR"]
HF_MODEL_NAME_OR_PATH = d["config"]["HF_MODEL_NAME_OR_PATH"]
HF_TOKEN = d["config"]["HF_TOKEN"]
IS_DEBUG = d["config"]["IS_DEBUG"]
USE_LOWPRIORITY_VM = d["config"]["USE_LOWPRIORITY_VM"]
# Hard-coded in the notebook (not read from config.yml).
USE_BUILTIN_ENV = False

# --- "train" section: environment, compute and W&B settings -----------------
azure_env_name = d["train"]["azure_env_name"]
azure_compute_cluster_name = d["train"]["azure_compute_cluster_name"]
azure_compute_cluster_size = d["train"]["azure_compute_cluster_size"]

wandb_api_key = d["train"]["wandb_api_key"]
wandb_project = d["train"]["wandb_project"]
wandb_watch = d["train"]["wandb_watch"]

# Make sure the local working directories exist before later cells write to them.
os.makedirs(SFT_DATA_DIR, exist_ok=True)
os.makedirs(DPO_DATA_DIR, exist_ok=True)
os.makedirs(CLOUD_DIR, exist_ok=True)

logger.info("===== 0. Azure ML Training Info =====")
logger.info(f"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}")
logger.info(f"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}")
logger.info(f"AZURE_WORKSPACE={AZURE_WORKSPACE}")
logger.info(f"AZURE_SFT_DATA_NAME={AZURE_SFT_DATA_NAME}")
logger.info(f"AZURE_DPO_DATA_NAME={AZURE_DPO_DATA_NAME}")
logger.info(f"SFT_DATA_DIR={SFT_DATA_DIR}")
logger.info(f"DPO_DATA_DIR={DPO_DATA_DIR}")
logger.info(f"CLOUD_DIR={CLOUD_DIR}")
logger.info(f"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}")
# Secrets are masked: notebook output is routinely saved and shared.
logger.info(f"HF_TOKEN={_mask_secret(HF_TOKEN)}")
logger.info(f"IS_DEBUG={IS_DEBUG}")
logger.info(f"USE_LOWPRIORITY_VM={USE_LOWPRIORITY_VM}")
logger.info(f"USE_BUILTIN_ENV={USE_BUILTIN_ENV}")

logger.info(f"azure_env_name={azure_env_name}")
logger.info(f"azure_compute_cluster_name={azure_compute_cluster_name}")
logger.info(f"azure_compute_cluster_size={azure_compute_cluster_size}")
logger.info(f"wandb_api_key={_mask_secret(wandb_api_key)}")
logger.info(f"wandb_project={wandb_project}")
logger.info(f"wandb_watch={wandb_watch}")

<br>

## 1. Dataset preparation

---

For this hands-on, we use a Hugging Face dataset. If you would like to build or augment your own dataset, please refer to https://github.com/Azure/synthetic-qa-generation


In [None]:
from datasets import load_dataset
from random import randrange

logger.info("===== 1. Dataset preparation =====")
logger.info("Loading dataset. It may take several minutes to load the dataset.")

# Load dataset from the hub

# data_path = "BCCard/BCCard-Finance-Kor-QnA"
# dataset = load_dataset(data_path, split="train")

dataset = load_dataset(
    "HuggingFaceH4/helpful_instructions", name="self_instruct", split="train[:10%]"
)
# Rename the columns to the instruction/output names written to the JSONL files.
dataset = dataset.rename_column("prompt", "instruction")
dataset = dataset.rename_column("completion", "output")

print(f"Loaded Dataset size: {len(dataset)}")

if IS_DEBUG:
    # Single source of truth for the debug subset size, clamped so that a
    # dataset smaller than the limit does not make select() raise.
    debug_sample_size = min(800, len(dataset))
    logger.info(
        f"Activated Debug mode. The number of samples was reduced to {debug_sample_size}."
    )
    dataset = dataset.select(range(debug_sample_size))
    print(f"Debug Dataset size: {len(dataset)}")

logger.info(f"Save dataset to {SFT_DATA_DIR}")
# 80/20 train/eval split, written as JSON Lines (non-ASCII characters kept as-is).
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
train_dataset.to_json(f"{SFT_DATA_DIR}/train.jsonl", force_ascii=False)
test_dataset = dataset["test"]
test_dataset.to_json(f"{SFT_DATA_DIR}/eval.jsonl", force_ascii=False)

<br>

## 2. Training preparation

---

### 2.1. Configure workspace details

To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.


In [None]:
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input

logger.info("===== 2. Training preparation =====")
logger.info("Calling DefaultAzureCredential.")

# Authenticate with the default Azure credential chain. The commented-out
# line below is the interactive browser alternative.
credential = DefaultAzureCredential()
# credential = InteractiveBrowserCredential()

# Workspace handle used by every later cell that talks to Azure ML.
ml_client = MLClient(
    credential=credential,
    subscription_id=AZURE_SUBSCRIPTION_ID,
    resource_group_name=AZURE_RESOURCE_GROUP,
    workspace_name=AZURE_WORKSPACE,
)

### 2.2. Create AzureML environment and data

Azure ML defines containers (called environment assets) in which your code will run. We can use a built-in (curated) environment or build a custom environment (Docker container, conda).
This hands-on builds a custom environment from the Docker build context in `CLOUD_DIR`.

Training data can be used as a dataset stored in the local development environment, but can also be registered as AzureML data.


In [None]:
# Environment asset, built from the Docker context directory CLOUD_DIR.
# NOTE(review): with update=False the helper presumably reuses an existing
# asset of the same name instead of rebuilding it — confirm in utils/aml_common.py.
env = get_or_create_docker_environment_asset(
    ml_client,
    azure_env_name,
    docker_dir=CLOUD_DIR,
    update=False,
)

# Data asset, sourced from the local SFT data directory.
data = get_or_create_data_asset(
    ml_client,
    AZURE_SFT_DATA_NAME,
    data_local_dir=SFT_DATA_DIR,
    update=False,
)

### 2.3. Training script


In [None]:
# !pygmentize scripts/launcher_distributed.py

<br>

## 3. Training

---

### 3.1. Create the compute cluster


In [None]:
from azure.ai.ml.entities import AmlCompute

logger.info("===== 3. Training =====")
### Create the compute cluster
try:
    compute = ml_client.compute.get(azure_compute_cluster_name)
    logger.info("The compute cluster already exists! Reusing it for the current run")
except Exception as ex:
    # The lookup failure is assumed to mean "cluster not found"; the reason is
    # logged so that auth/network problems are not mistaken for a missing cluster.
    logger.info(
        f"Looks like the compute cluster doesn't exist. Creating a new one with compute size {azure_compute_cluster_size}!"
    )
    logger.info(f"Reason the lookup failed: {ex}")
    tier = "LowPriority" if USE_LOWPRIORITY_VM else "Dedicated"
    try:
        logger.info(f"Attempt #1 - Trying to create a {tier} compute")
        compute = AmlCompute(
            name=azure_compute_cluster_name,
            size=azure_compute_cluster_size,
            tier=tier,
            max_instances=1,  # For multi node training set this to an integer value more than 1
        )
        # Block until provisioning of the cluster has finished.
        ml_client.compute.begin_create_or_update(compute).wait()
    except Exception as e:
        # Training cannot run without the cluster: report the real cause and
        # stop here instead of failing later with a less obvious error.
        logger.error(
            f"Failed to create compute cluster '{azure_compute_cluster_name}': {e}"
        )
        raise

### 3.2. Start training job

The `command` function lets you configure the following key aspects.

-   `inputs` - This is the dictionary of inputs using name value pairs to the command.
    -   `type` - The type of input. This can be a `uri_file` or `uri_folder`. The default is `uri_folder`.
    -   `path` - The path to the file or folder. These can be local or remote files or folders. For remote files - http/https, wasb are supported.
        -   Azure ML `data`/`dataset` or `datastore` are of type `uri_folder`. To use `data`/`dataset` as input, you can use registered dataset in the workspace using the format '<data_name>:<version>'. For e.g Input(type='uri_folder', path='my_dataset:1')
    -   `mode` - Mode of how the data should be delivered to the compute target. Allowed values are `ro_mount`, `rw_mount` and `download`. Default is `ro_mount`
-   `code` - This is the path where the code to run the command is located
-   `compute` - The compute on which the command will run. You can run it on the local machine by using `local` for the compute.
-   `command` - This is the command that needs to be run. Inputs can be referenced
    in the `command` using the `${{inputs.<input_name>}}` expression. To use files or folders as inputs, we can use the `Input` class. The `Input` class supports three parameters (`type`, `path`, and `mode`), described under `inputs` above.
-   `environment` - This is the environment needed for the command to run. Curated (built-in) or custom environments from the workspace can be used.
-   `instance_count` - Number of nodes. Default is 1.
-   `distribution` - Distribution configuration for distributed training scenarios. Azure Machine Learning supports PyTorch, TensorFlow, and MPI-based distributed training.


In [None]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output
from azure.ai.ml.entities import ResourceConfiguration
from utils.aml_common import get_num_gpus
from azure.ai.ml.entities import Model

num_gpu = get_num_gpus(azure_compute_cluster_size)
logger.info(f"Number of GPUs={num_gpu}")

str_command = ""
if USE_BUILTIN_ENV:
    # Curated (built-in) Environment asset; extra packages are installed at job start.
    str_env = "azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/versions/26"
    str_command += "pip install -r requirements.txt && "
else:
    # Custom Environment asset registered in the workspace under azure_env_name.
    str_env = f"{azure_env_name}@latest"

# Pick the torchtune recipe and launcher script that match the GPU count.
if num_gpu > 1:
    tune_recipe = "lora_finetune_distributed"
    str_command += "python launcher_distributed.py "
else:
    tune_recipe = "lora_finetune_single_device"
    str_command += "python launcher_single.py "

inputs_dict = dict(
    # train_dir=Input(type="uri_folder", path=SFT_DATA_DIR), # Get data from local path
    train_dir=Input(path=f"{AZURE_SFT_DATA_NAME}@latest"),  # Get data from Data asset
    hf_token=HF_TOKEN,
    tune_recipe=tune_recipe,
    tune_action="fine-tune",
    model_id=HF_MODEL_NAME_OR_PATH,
    model_dir="./model",
    log_dir="./outputs/log",
    model_output_dir="./outputs",
    tune_finetune_yaml="lora_finetune_phi4.yaml",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE
    tune_eval_yaml="evaluation_phi4.yaml",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE
    tune_quant_yaml="quant_phi4.yaml",  # YOU CAN CHANGE THIS TO YOUR OWN CONFIG FILE
)

# Weights & Biases logging is optional. A truthiness test also covers an empty
# YAML value (None), on which len() would raise TypeError.
if wandb_api_key:
    inputs_dict["wandb_api_key"] = wandb_api_key
    inputs_dict["wandb_project"] = wandb_project
    inputs_dict["wandb_watch"] = wandb_watch

# Every job input is forwarded to the launcher as `--<name> ${{inputs.<name>}}`.
# Deriving the flags from inputs_dict keeps the two from drifting apart and
# avoids embedding source indentation in the command string.
# NOTE(review): hf_token / wandb_api_key travel as plain job inputs and are
# therefore visible in the job definition — consider passing them as secrets.
str_command += " ".join(
    f"--{input_name} ${{{{inputs.{input_name}}}}}" for input_name in inputs_dict
)

logger.info(f"Tune recipe: {tune_recipe}")

job = command(
    inputs=inputs_dict,
    code="./scripts",  # local path where the code is stored
    compute=azure_compute_cluster_name,
    command=str_command,
    environment=str_env,
    instance_count=1,
    distribution={
        "type": "PyTorch",
        "process_count_per_instance": num_gpu,  # For multi-gpu training set this to an integer value more than 1
    },
)

# Submit the job, then stream its logs in the notebook.
returned_job = ml_client.jobs.create_or_update(job)
logger.info(
    """Started training job. Now a dedicated Compute Cluster for training is provisioned and the environment
required for training is automatically set up from Environment.

If you have set up a new custom Environment, it will take approximately 20 minutes or more to set up the Environment before provisioning the training cluster.
"""
)
ml_client.jobs.stream(returned_job.name)

In [None]:
# Render the submitted job object as rich notebook output.
display(returned_job)

In [None]:
# Keep the name of the submitted job; later cells use it to persist the value
# with %store and to create the model asset from this job.
job_name = returned_job.name

In [None]:
# Persist job_name with IPython's %store magic so that another notebook or
# session can restore it with `%store -r job_name`.
%store job_name

<br>

## 4. (Optional) Create model asset and get fine-tuned LLM to local folder

---

### 4.1. Create model asset


In [None]:
from utils.aml_common import get_or_create_model_asset

# Log the section banner and parameters first (as the other sections do), so
# a failure inside the helper can still be traced to its inputs.
logger.info(
    "===== 4. (Optional) Create model asset and get fine-tuned LLM to local folder ====="
)

azure_model_name = d["serve"]["azure_model_name"]
model_dir = d["train"]["model_dir"]
logger.info(f"azure_model_name={azure_model_name}")
logger.info(f"model_dir={model_dir}")

# Create (or fetch) the model asset from the training job identified by job_name.
# NOTE(review): exact semantics of download_quantized_model_only / update are
# defined in utils/aml_common.py — confirm there.
model = get_or_create_model_asset(
    ml_client,
    azure_model_name,
    job_name,
    model_dir,
    model_type="custom_model",
    download_quantized_model_only=True,
    update=False,
)

logger.info(f"model={model}")

### 4.2. Get fine-tuned LLM to local folder

You can copy it to your local directory to perform inference or serve the model in Azure environment. (e.g., real-time endpoint)


In [None]:
# Optional: pull the registered model down to a local folder.
# Flip DOWNLOAD_TO_LOCAL to True to enable the download.
DOWNLOAD_TO_LOCAL = False
local_model_dir = "./artifact_downloads_sft"

if DOWNLOAD_TO_LOCAL:
    # Create the target folder first, then fetch the version registered above.
    os.makedirs(local_model_dir, exist_ok=True)
    ml_client.models.download(
        name=azure_model_name,
        version=model.version,
        download_path=local_model_dir,
    )

## Clean up


In [None]:
# Delete the local SFT/DPO data directories and the local model download
# folder. This is irreversible. The notebook variables are interpolated into
# the shell command, so the cells defining SFT_DATA_DIR, DPO_DATA_DIR and
# local_model_dir must have been run in this session.
!rm -rf $SFT_DATA_DIR $DPO_DATA_DIR {local_model_dir}