The purpose of this notebook is to use LoRA (Low-Rank Adaptation) to fine-tune a DistilBERT sequence-classification model for intent classification.

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (AutoTokenizer,
                         AutoConfig,
                         AutoModelForSequenceClassification,
                         DataCollatorWithPadding,
                         TrainingArguments,
                         Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
import torch
from torch import cuda

# Pick the best accelerator that is actually usable on this machine.
# Apple's MPS backend is only selected when it reports itself as available;
# otherwise fall back to the CPU so later `.to(device)` calls cannot fail on
# hosts that have neither CUDA nor MPS.
if cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

#### Base Model (DistilBERT)

In [None]:
model_checkpoint = 'distilbert-base-uncased'

# Mapping between integer class ids and intent names. The classification head
# below is sized from this mapping, so adding or removing an intent only needs
# a change here.
id2label = {0: 'information_intent',
            1: 'yelp_intent',
            2: 'navigation_intent',
            3: 'travel_intent',
            4: 'purchase_intent',
            5: 'weather_intent',
            6: 'translation_intent',
            7: 'unknown'}
label2id = {label: idx for idx, label in id2label.items()}


# Build a sequence-classification model from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Show the model object as the cell output.
model

#### Load dataset

In [None]:
# Read the training CSV, then report the row count and the number of rows per
# target before previewing the first ten rows.
df = pd.read_csv("../data/marco_train.csv")
print(df.shape[0])
target_counts = df['target'].value_counts()
print(target_counts)
df.iloc[:10]

In [None]:
# Character-length distribution of the raw sequences: summary statistics at a
# fine-grained set of percentiles, followed by a 25-bin histogram.
sequence_char_lengths = df['sequence'].apply(len)
length_percentiles = [.1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .98, .99, .995, .998, .999]
print(sequence_char_lengths.describe(percentiles=length_percentiles))
sequence_char_lengths.hist(bins=25);

In [None]:
# Widen the column display so long sequences stay readable, then show every
# row whose sequence is longer than 64 characters.
pd.set_option('display.max_colwidth', 100)
is_long_sequence = df['sequence'].apply(len) > 64
df.loc[is_long_sequence]

In [None]:
# Instantiate the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Number of input ids produced for every sequence when nothing is truncated.
token_lengths = [
    len(tokenizer(sequence, truncation=False)['input_ids'])
    for sequence in df['sequence'].values
]

# Temporary frame used only for the statistics printed below.
token_stats = pd.DataFrame({'sequence': df['sequence'].values, 'token_length': token_lengths})
token_length = token_stats['token_length']

# Summary used to decide on a sensible `max_length` for tokenization.
print(f"Max token length: {token_length.max()}")
print(f"Average token length: {token_length.mean()}")
for ordinal, quantile in (
    ("90th", 0.9),
    ("95th", 0.95),
    ("98th", 0.98),
    ("99th", 0.99),
    ("99.5th", 0.995),
    ("99.9th", 0.999),
):
    print(f"{ordinal} percentile token length: {token_length.quantile(quantile)}")

del token_length, token_stats

In [None]:
# Select only a sample from the actual data: `information_intent` is reduced to
# 15% of its rows while every other target keeps all of its rows.
# Keys are values of the `target` column; values are the fraction to keep.
sampling_percentages = {
    'information_intent': 0.15,  # 15% sampling for information_intent
    'yelp_intent': 1.0,          # 100% sampling for yelp_intent
    'weather_intent': 1.0,       # 100% sampling for weather_intent
    'navigation_intent': 1.0,    # 100% sampling for navigation_intent
    'purchase_intent': 1.0,      # 100% sampling for purchase_intent
    'translation_intent': 1.0,   # 100% sampling for translation_intent
    'travel_intent': 1.0,        # 100% sampling for travel_intent
    'unknown': 1.0               # 100% sampling for unknown
}

# Seed for every per-target draw so the sampled subset is the same on each run
# (the train/validation split that follows is seeded as well).
sampling_seed = 42

# Sample each target group at its configured rate; targets that are not listed
# are kept in full. Iterating the groups explicitly keeps the `target` column
# in the result without relying on how `groupby.apply` treats grouping columns.
sampled_df = pd.concat(
    [
        group.sample(frac=sampling_percentages.get(target, 1.0), random_state=sampling_seed)
        for target, group in df.groupby('target')
    ]
).reset_index(drop=True)

sampled_df['label'] = sampled_df['target'].map(label2id)

# A target that is missing from `label2id` maps to NaN; stop here with a clear
# message instead of carrying NaN labels into the split and training.
unmapped_targets = sampled_df.loc[sampled_df['label'].isna(), 'target'].unique()
if len(unmapped_targets) > 0:
    raise ValueError(f"Targets without an entry in label2id: {list(unmapped_targets)}")

print(sampled_df['label'].value_counts())
print(f"Size of sampled_df = {len(sampled_df)}")
sampled_df.head()

In [None]:
# Hold out 5% of the sampled rows for validation. The split is stratified on
# `label` and seeded.
train_df, val_df = train_test_split(
    sampled_df,
    test_size=0.05,
    stratify=sampled_df['label'],
    random_state=42,
)

# Wrap each pandas frame as a Hugging Face Dataset without keeping the index.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Bundle both splits under the names used by the training code.
dataset_dict = DatasetDict({'train': train_dataset, 'validation': val_dataset})

# Print the resulting structure as a sanity check.
print(dataset_dict)

In [None]:
# Number of training rows per label id.
train_df['label'].value_counts()

In [None]:
# Number of validation rows per label id.
val_df['label'].value_counts()

#### Preprocess data

In [None]:


# If the tokenizer has no padding token, register '[PAD]' as one and resize
# the model's token embeddings to the new tokenizer length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# tokenize function
def tokenize_function(examples):
    """Tokenize a batch of examples for use with ``Dataset.map(batched=True)``.

    Args:
        examples: Batch mapping whose "sequence" entry holds the raw text(s).

    Returns:
        The tokenizer output for the batch, truncated to at most 64 tokens
        and left unpadded.
    """
    text = examples["sequence"]

    # Keep the start of over-long sequences and drop tokens from the end.
    tokenizer.truncation_side = "right"

    # No `padding` or `return_tensors` here on purpose: padding inside `map`
    # pads every row to the longest row of the whole map batch, which wastes
    # compute during training. Padding is left to the data collator passed to
    # the Trainer, which pads each training batch to its own longest row.
    return tokenizer(text, truncation=True, max_length=64)

# def fix_labels(examples):
#     examples["idx"] = int(examples["idx"])
#     return examples

# Collator that pads the examples of a batch using this tokenizer; it is
# passed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Apply tokenize_function to both splits; with `batched=True` the function
# receives batches of rows instead of one row at a time.
tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)
# tokenized_dataset = tokenized_dataset.map(fix_labels)
tokenized_dataset

#### Evaluation Metrics

In [None]:
# Metrics already loaded through `evaluate.load`, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        p: Pair ``(logits, labels)``; the predicted class of each row is the
            argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = p
    predictions = np.argmax(logits, axis=1)

    # The metric objects are reused across calls: this function runs at every
    # evaluation step, and reloading four metrics each time is wasted work.
    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    weighted_scores = {
        name: _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[name]
        for name in ("precision", "recall", "f1")
    }

    return {"accuracy": accuracy["accuracy"], **weighted_scores}

In [None]:
### Evaluate untrained model

# Small hand-written sample of queries and the label id expected for each.
text_list = [
    'floor repair cost', 
    'denture fix', 
    'who is the us president', 
    'italian food', 
    'sandwiches in seattle',
]

sample_labels = [
    label2id["yelp_intent"],
    label2id["yelp_intent"],
    label2id["information_intent"],
    label2id["yelp_intent"],
    label2id["yelp_intent"]
]

print("Untrained model predictions:")
print("----------------------------")
predictions = []
logits_list = []

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    for text in text_list:
        # Put the ids on whatever device the model currently lives on.
        inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
        logits = model(inputs).logits
        prediction = torch.argmax(logits, dim=1).item()
        predictions.append(prediction)
        # Keep one logits row per text so that (logits_list, sample_labels)
        # can be handed to compute_metrics.
        logits_list.append(logits.squeeze(0).cpu().numpy())
        print(text + " -> " + id2label[prediction])

In [None]:
# compute_metrics((logits_list, sample_labels))

#### Model finetuning with LoRA

In [None]:
# LoRA adapter configuration for sequence classification.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling numerator: the LoRA update is multiplied by lora_alpha / r
    lora_dropout=0.01,  # dropout probability applied on the LoRA branch
    target_modules=['q_lin'],  # LoRA is applied to the attention query projections
    # The classification head (`pre_classifier` and `classifier`) is not part
    # of the 'distilbert-base-uncased' checkpoint and starts from random
    # weights. Listing both modules makes them trainable and stores them in
    # the adapter checkpoint, so an adapter loaded onto a freshly created base
    # model gets the trained head instead of a new random one.
    modules_to_save=['pre_classifier', 'classifier'],
)

# Wrap the base model with the adapter and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# for name, module in model.named_modules():
#     print(name)

#### Define hyperparameters and training arguments

In [None]:
# Optimisation hyperparameters.
lr = 1e-3
batch_size = 4
num_epochs = 10

# Training configuration: evaluate and checkpoint once per epoch, and load the
# best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-lora-intent-classification-v2",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# data_collator.tokenizer

In [None]:
# Assemble the Trainer from the pieces prepared in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,  # hyperparameters and run configuration
    tokenizer=tokenizer,
    data_collator=data_collator,  # pads each batch
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,  # metrics reported at evaluation time
)


In [None]:
# Run training with the arguments given to the Trainer above.
trainer.train()

In [None]:
# Show the Trainer's model object as the cell output.
trainer.model

In [None]:
# Predict the sample queries with the trained model.
trainer.model.eval()

# Use the device the model's parameters are actually on, not a separately
# chosen global, so inputs and weights can never end up on different devices.
model_device = next(trainer.model.parameters()).device

with torch.no_grad():
    for text in text_list:
        inputs = tokenizer.encode(text, return_tensors="pt").to(model_device)
        logits = trainer.model(inputs).logits
        prediction = torch.argmax(logits, dim=1).item()
        print(text + " -> " + id2label[prediction])

In [None]:
# !ls -ltr distilbert-base-uncased-lora-intent-classification/checkpoint-15048
!ls -lh distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716

In [None]:
# !ls -lh "distilbert-base-uncased-lora-intent-classification/checkpoint-15048/adapter_model.safetensors"

In [None]:
# !python -m pip install onnx onnxruntime optimum

#### Load the LoRA model from checkpoint after training

In [None]:
# Class-id <-> intent-name mappings, repeated here so this cell can run
# without the training cells. They must match the mappings used for training.
id2label = {0: 'information_intent',
            1: 'yelp_intent',
            2: 'navigation_intent',
            3: 'travel_intent',
            4: 'purchase_intent',
            5: 'weather_intent',
            6: 'translation_intent',
            7: 'unknown'}
label2id = {label: idx for idx, label in id2label.items()}


# output_dir="distilbert-base-uncased-lora-intent-classification/checkpoint-37620"
output_dir = "distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716"

# Load the tokenizer saved in the checkpoint directory. Truncation, maximum
# length and tensor type are per-call options, so they are passed where the
# text is tokenized below rather than here.
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Load the base model from the original pretrained checkpoint.
base_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Load the LoRA configuration and attach the adapter to the base model.
peft_config = PeftConfig.from_pretrained(output_dir)
lora_model = PeftModel.from_pretrained(base_model, output_dir)

# Save the PEFT model. This writes the adapter files only, not a merged
# full model.
# NOTE(review): the next cell writes the merged model into the same
# directory; consider a separate directory for the adapter-only files.
save_directory = "tmp/lora_combined_model/"
lora_model.save_pretrained(save_directory)

# `lora_model` now wraps the base model together with the LoRA weights.
lora_model.eval()

# Example inference, tokenized with the same truncation limit as training.
inputs = tokenizer(
    ["looking for home cleaning "],
    return_tensors="pt",
    truncation=True,
    max_length=64,
)
with torch.no_grad():  # inference only, no gradients needed
    outputs = lora_model(**inputs)
logits = outputs.logits
print(logits)


prediction = torch.argmax(logits, dim=1).item()
print(prediction, id2label[prediction])
probabilities = torch.softmax(logits, dim=1)
# Round to four decimals for display; rounding to whole numbers would turn
# every probability into 0 or 1.
rounded_probabilities = torch.round(probabilities, decimals=4)
print(rounded_probabilities)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig

# Where the trained adapter is read from and where the merged model is written.
output_dir = "distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716"
save_directory = "tmp/lora_combined_model/"

# Start from a fresh DistilBERT classifier with the project's label mappings.
base_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=8,
    id2label=id2label,
    label2id=label2id,
)

# Attach the LoRA adapter stored in the checkpoint directory.
peft_config = PeftConfig.from_pretrained(output_dir)
lora_model = PeftModel.from_pretrained(base_model, output_dir)

# Merge the LoRA weights into the base model; `base_model` is rebound to the
# merged result.
base_model = lora_model.merge_and_unload()

# Write the merged model and the checkpoint's tokenizer side by side.
base_model.save_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
tokenizer.save_pretrained(save_directory)


In [None]:
!ls -ltr tmp/lora_combined_model/

In [None]:
# Show the `base_model` object as the cell output.
base_model

In [None]:
# Show the directory path currently stored in `save_directory`.
save_directory

In [None]:
!python -m pip install --upgrade huggingface_hub
!python -m pip install ipywidgets

In [None]:
from huggingface_hub import login
# login("",write_permission=True)

In [None]:
from huggingface_hub import HfApi
# Hub API client; the commented-out `api.upload_file(...)` calls in the
# following cells use it.
api = HfApi()

In [None]:
# api.upload_file(
#     path_or_fileobj="/home/jupyter/lora_model_fp16.onnx",
#     path_in_repo="onnx/model_fp16.onnx",
#     repo_id="chidamnat2002/intent_classifier",
#     repo_type="model",
# )

In [None]:
# api.upload_file(
#     path_or_fileobj="tmp/lora_combined_model/model.safetensors",
#     path_in_repo="model.safetensors",
#     repo_id="chidamnat2002/intent_classifier",
#     repo_type="model",
# )