notebooks/archives/suggest_intent_dl

{ "cells": [ { "cell_type": "markdown", "id": "83df52ac-a343-49ec-83bc-16b8c91b9dd0", "metadata": {}, "source": [ "Purpose of this notebook is to use LORA (aka Low Rank Adaptation method) and finetune" ] }, { "cell_type": "code", "execution_count": null, "id": "96c683dd-30bd-448a-ada9-bc4562b61663", "metadata": { "tags": [] }, "outputs": [], "source": [ "from datasets import load_dataset, DatasetDict, Dataset\n", "\n", "from transformers import (AutoTokenizer,\n", " AutoConfig,\n", " AutoModelForSequenceClassification,\n", " DataCollatorWithPadding,\n", " TrainingArguments,\n", " Trainer)\n", "\n", "from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig\n", "import evaluate\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": null, "id": "f32ae2ac-7f08-4fc4-baa7-10a7ccbcf800", "metadata": { "tags": [] }, "outputs": [], "source": [ "from torch import cuda\n", "device = 'cuda' if cuda.is_available() else 'mps'\n", "print(device)" ] }, { "cell_type": "markdown", "id": "82e512cf-4776-4278-9612-ec5c8be44f80", "metadata": {}, "source": [ "#### Base Model (DistillBERT)" ] }, { "cell_type": "code", "execution_count": null, "id": "e1dc1475-55e2-42ca-a881-8f33e475f41c", "metadata": { "tags": [] }, "outputs": [], "source": [ "model_checkpoint = 'distilbert-base-uncased'\n", "id2label = {0: 'information_intent',\n", " 1: 'yelp_intent',\n", " 2: 'navigation_intent',\n", " 3: 'travel_intent',\n", " 4: 'purchase_intent',\n", " 5: 'weather_intent',\n", " 6: 'translation_intent',\n", " 7: 'unknown'}\n", "label2id = {label:id for id,label in id2label.items()}\n", "\n", "\n", "# generate classification model from model chckpoints\n", "model = AutoModelForSequenceClassification.from_pretrained(\n", " model_checkpoint,\n", " num_labels=8,\n", " id2label=id2label,\n", " label2id=label2id\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "57a893d3-2bf9-45cb-b678-436748405b94", "metadata": { "tags": [] }, "outputs": [], "source": [ "model" ] }, { "cell_type": "markdown", "id": "bbeaa99b-10c2-44ed-8d21-08ed2b3990bd", "metadata": {}, "source": [ "#### Load dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "81eb7dac-565f-4247-b42d-fe2d7675ec60", "metadata": { "tags": [] }, "outputs": [], "source": [ "df = pd.read_csv(\"../data/marco_train.csv\")\n", "print(len(df))\n", "print(df['target'].value_counts())\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": null, "id": "bb24484d-7380-49b6-b67f-5b645e46cf49", "metadata": { "tags": [] }, "outputs": [], "source": [ "print(df['sequence'].apply(len).describe(percentiles=[.1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .98, .99, .995, .998, .999]))\n", "df['sequence'].apply(len).hist(bins=25);" ] }, { "cell_type": "code", "execution_count": null, "id": "082fb996-037b-4353-8739-7047d6142f69", "metadata": { "tags": [] }, "outputs": [], "source": [ "pd.set_option('display.max_colwidth', 100)\n", "df.loc[df['sequence'].apply(lambda text: len(text) > 64)]" ] }, { "cell_type": "code", "execution_count": null, "id": "b7e0d5f3-b927-477e-ac5b-5e05fce4835b", "metadata": { "tags": [] }, "outputs": [], "source": [ "# create tokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)\n", "\n", "token_lengths = []\n", "for sequence in df['sequence'].values:\n", " tokens = tokenizer(sequence, truncation=False)['input_ids'] # Get tokenized input IDs\n", " token_lengths.append(len(tokens))\n", "\n", "# Create a DataFrame for analysis\n", "temp_df = pd.DataFrame({'sequence': df['sequence'].values, 'token_length': token_lengths})\n", "\n", "# Display token lengths\n", "# print(temp_df)\n", "\n", "# Optional: Analyze token lengths for deciding the best max_length\n", "print(f\"Max token length: {temp_df['token_length'].max()}\")\n", "print(f\"Average token length: {temp_df['token_length'].mean()}\")\n", "print(f\"90th percentile token length: {temp_df['token_length'].quantile(0.9)}\")\n", "print(f\"95th percentile token length: {temp_df['token_length'].quantile(0.95)}\")\n", "print(f\"98th percentile token length: {temp_df['token_length'].quantile(0.98)}\")\n", "print(f\"99th percentile token length: {temp_df['token_length'].quantile(0.99)}\")\n", "print(f\"99.5th percentile token length: {temp_df['token_length'].quantile(0.995)}\")\n", "print(f\"99.9th percentile token length: {temp_df['token_length'].quantile(0.999)}\")\n", "\n", "del temp_df" ] }, { "cell_type": "code", "execution_count": null, "id": "13e6c934-fdc6-4d28-9c87-69eec74e5beb", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Select only a sample from the actual data\n", "\n", "sampling_percentages = {\n", " 'information_intent': 0.15, # 15% sampling for information_intent\n", " 'yelp_intent': 1.0, # 100% sampling for yelp_intent\n", " 'weather_intent': 1.0, # 100% sampling for weather_intent\n", " 'navigation_intent': 1.0, # 100% sampling for navigation_intent\n", " 'purchase_intent': 1.0, # 100% sampling for purchase_intent\n", " 'translation_intent': 1.0, # 100% sampling for translation_intent\n", " 'travel_intent': 1.0, # 100% sampling for travel_intent\n", " 'unknown': 1.0 # 100% sampling for unknown\n", "}\n", "\n", "# Sample from each target group based on the defined percentages\n", "sampled_df = df.groupby('target', group_keys=False).apply(\n", " lambda x: x.sample(frac=sampling_percentages.get(x.name, 1.0))\n", ").reset_index(drop=True)\n", "\n", "sampled_df['label'] = sampled_df['target'].map(label2id)\n", "# sampled_df = sampled_df.rename(columns={'target': 'label'})\n", "\n", "print(sampled_df['label'].value_counts())\n", "print(f\"Size of sampled_df = {len(sampled_df)}\")\n", "sampled_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "8dadb9a2-6184-4e50-bf21-8063e31edbb1", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Step 1: Split the DataFrame into train and validation sets\n", "train_df, val_df = train_test_split(sampled_df, test_size=0.05, random_state=42, stratify=sampled_df['label'])\n", "\n", "# Step 2: Convert Pandas DataFrames to Hugging Face Datasets\n", "train_dataset = Dataset.from_pandas(train_df, preserve_index=False)\n", "val_dataset = Dataset.from_pandas(val_df, preserve_index=False)\n", "\n", "# Step 3: Create a DatasetDict\n", "dataset_dict = DatasetDict({\n", " 'train': train_dataset,\n", " 'validation': val_dataset\n", "})\n", "\n", "# Step 4: Verify the structure of DatasetDict\n", "print(dataset_dict)" ] }, { "cell_type": "code", "execution_count": null, "id": "0613b1e7-e4ec-4b18-91a2-2648c42a91b6", "metadata": { "tags": [] }, "outputs": [], "source": [ "train_df['label'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "68d476d6-4aa2-4180-889b-49f8e5251219", "metadata": { "tags": [] }, "outputs": [], "source": [ "val_df['label'].value_counts()" ] }, { "cell_type": "markdown", "id": "68ae60f7-3f28-4d95-bee7-28d05b68566f", "metadata": {}, "source": [ "#### Preprocess data" ] }, { "cell_type": "code", "execution_count": null, "id": "17cb2d99-4f2f-4caa-8f1f-747fbc9c9082", "metadata": { "tags": [] }, "outputs": [], "source": [ "\n", "\n", "# add pad token if none exists\n", "if tokenizer.pad_token is None:\n", " tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n", " model.resize_token_embeddings(len(tokenizer))\n", "\n", "# tokenize function\n", "def tokenize_function(examples):\n", " # extract text\n", " text = examples[\"sequence\"]\n", "\n", " # tokenize and truncate text\n", " tokenizer.truncation_side = \"right\"\n", " tokenized_inputs = tokenizer(\n", " text,\n", " return_tensors=\"pt\",\n", " truncation=True,\n", " padding=True, # Pad the sequences to the longest in the batch\n", " max_length=64\n", " )\n", " return tokenized_inputs\n", "\n", "# def fix_labels(examples):\n", "# examples[\"idx\"] = int(examples[\"idx\"])\n", "# return examples\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "id": "d52d8863-7705-40d2-a6b4-26b1865f5c1c", "metadata": { "tags": [] }, "outputs": [], "source": [ "tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)\n", "# tokenized_dataset = tokenized_dataset.map(fix_labels)\n", "tokenized_dataset" ] }, { "cell_type": "markdown", "id": "56166c5b-958c-4921-961a-7e42c1ef37d7", "metadata": {}, "source": [ "#### Evaluation Metrics" ] }, { "cell_type": "code", "execution_count": null, "id": "d481d862-dfe6-4635-9d0c-7c827a2155f9", "metadata": { "tags": [] }, "outputs": [], "source": [ "def compute_metrics(p):\n", " logits, labels = p\n", " predictions = np.argmax(logits, axis=1)\n", " # Combine metrics\n", " accuracy_metric = evaluate.load(\"accuracy\")\n", " precision_metric = evaluate.load(\"precision\")\n", " recall_metric = evaluate.load(\"recall\")\n", " f1_metric = evaluate.load(\"f1\")\n", "\n", " # Calculate metrics\n", " accuracy = accuracy_metric.compute(predictions=predictions, references=labels)\n", " precision = precision_metric.compute(predictions=predictions, references=labels, average=\"weighted\")\n", " recall = recall_metric.compute(predictions=predictions, references=labels, average=\"weighted\")\n", " f1 = f1_metric.compute(predictions=predictions, references=labels, average=\"weighted\")\n", "\n", " return {\n", " \"accuracy\": accuracy[\"accuracy\"],\n", " \"precision\": precision[\"precision\"],\n", " \"recall\": recall[\"recall\"],\n", " \"f1\": f1[\"f1\"]\n", " }" ] }, { "cell_type": "code", "execution_count": null, "id": "f08d8464-0714-4021-a266-9ceb64f36f0b", "metadata": { "tags": [] }, "outputs": [], "source": [ "### Evaluate untrained model\n", "\n", "text_list = [\n", " 'floor repair cost', \n", " 'denture fix', \n", " 'who is the us president', \n", " 'italian food', \n", " 'sandwiches in seattle',\n", "]\n", "\n", "sample_labels = [\n", " label2id[\"yelp_intent\"],\n", " label2id[\"yelp_intent\"],\n", " label2id[\"information_intent\"],\n", " label2id[\"yelp_intent\"],\n", " label2id[\"yelp_intent\"]\n", "]\n", "\n", "print(\"Untrained model predictions:\")\n", "print(\"----------------------------\")\n", "predictions = []\n", "logits_list = []\n", "for text in text_list:\n", " inputs = tokenizer.encode(text, return_tensors=\"pt\")\n", " logits = model(inputs).logits\n", " prediction = torch.argmax(logits, dim=1).item()\n", " predictions.append(prediction)\n", " print(text + \" -> \" + id2label[prediction])" ] }, { "cell_type": "code", "execution_count": null, "id": "735012ed-4d2c-4800-815c-7bd7238524ed", "metadata": { "tags": [] }, "outputs": [], "source": [ "# compute_metrics((logits_list, sample_labels))" ] }, { "cell_type": "markdown", "id": "82bfb5ad-391d-42da-bcf5-d62e39642225", "metadata": {}, "source": [ "#### Model finetuning with LoRA" ] }, { "cell_type": "code", "execution_count": null, "id": "cece8cd8-b017-4f80-9d9a-713041270fc2", "metadata": { "tags": [] }, "outputs": [], "source": [ "peft_config = LoraConfig(task_type=\"SEQ_CLS\",\n", " r=4, # intrinsic rank of trainable weight matrix\n", " lora_alpha=32, # similar to learning_rate\n", " lora_dropout=0.01, # probability of dropout nodes\n", " target_modules=['q_lin']) # LoRA is applied to query layer\n", "\n", "model = get_peft_model(model, peft_config)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": null, "id": "93c9ac1c-cd57-4173-b208-78831d9c4f99", "metadata": {}, "outputs": [], "source": [ "# for name, module in model.named_modules():\n", "# print(name)" ] }, { "cell_type": "markdown", "id": "66ccdcde-6b2c-499d-a185-d04dbd1b6ba7", "metadata": {}, "source": [ "#### Define hyper parameters and training arguments" ] }, { "cell_type": "code", "execution_count": null, "id": "84297802-ae8a-4fef-882a-7d7145de72bf", "metadata": { "tags": [] }, "outputs": [], "source": [ "lr = 1e-3\n", "batch_size = 4\n", "num_epochs = 10\n", "\n", "# training args\n", "training_args = TrainingArguments(\n", " output_dir=model_checkpoint + \"-lora-intent-classification-v2\",\n", " learning_rate=lr,\n", " per_device_train_batch_size=batch_size,\n", " per_device_eval_batch_size=batch_size,\n", " num_train_epochs=num_epochs,\n", " weight_decay=0.01,\n", " eval_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " load_best_model_at_end=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "39930440-66ab-49a4-a198-04d03bb576ec", "metadata": { "tags": [] }, "outputs": [], "source": [ "# data_collator.tokenizer" ] }, { "cell_type": "code", "execution_count": null, "id": "615cdb83-edc7-409a-a929-6c370c8aa9e1", "metadata": { "tags": [] }, "outputs": [], "source": [ "trainer = Trainer(\n", " model=model, \n", " args=training_args, # Hyperparamaters\n", " train_dataset=tokenized_dataset[\"train\"], # training data\n", " eval_dataset=tokenized_dataset[\"validation\"], # validation data\n", " tokenizer=tokenizer, # tokenizer\n", " data_collator=data_collator, # dynamic sequence padding\n", " compute_metrics=compute_metrics, # model perfomance evaluation metric\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f9fde457-1a0d-4552-b6fa-1c0642349972", "metadata": { "tags": [] }, "outputs": [], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "id": "1da3c5f1-ef7f-4ca1-9792-4879b621f267", "metadata": { "tags": [] }, "outputs": [], "source": [ "trainer.model" ] }, { "cell_type": "code", "execution_count": null, "id": "15d874c9-e652-4c17-8a54-11d94710debb", "metadata": { "tags": [] }, "outputs": [], "source": [ "trainer.model.eval()\n", "with torch.no_grad():\n", " for text in text_list:\n", " inputs = tokenizer.encode(text, return_tensors=\"pt\").to(device)\n", " logits = trainer.model(inputs).logits\n", " prediction = torch.argmax(logits, dim=1).item()\n", " print(text + \" -> \" + id2label[prediction])" ] }, { "cell_type": "code", "execution_count": null, "id": "66aae059-b207-43d5-8a78-a76901550c8a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# !ls -ltr distilbert-base-uncased-lora-intent-classification/checkpoint-15048\n", "!ls -lh distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716" ] }, { "cell_type": "code", "execution_count": null, "id": "6557a796-1ba4-455d-9c72-acb52f8958c3", "metadata": { "tags": [] }, "outputs": [], "source": [ "# !ls -lh \"distilbert-base-uncased-lora-intent-classification/checkpoint-15048/adapter_model.safetensors\"" ] }, { "cell_type": "code", "execution_count": null, "id": "34a5d900-a6f7-446c-b608-3beb4104b850", "metadata": { "tags": [] }, "outputs": [], "source": [ "# !python -m pip install onnx onnxruntime optimum" ] }, { "cell_type": "markdown", "id": "0c08157f-ff0c-46a3-aa0d-f04493a71d36", "metadata": {}, "source": [ "#### Load the LoRA model from checkpoint after training" ] }, { "cell_type": "code", "execution_count": null, "id": "041cee58-787b-430c-a9c4-c4aadcf87a58", "metadata": { "tags": [] }, "outputs": [], "source": [ "id2label = {0: 'information_intent',\n", " 1: 'yelp_intent',\n", " 2: 'navigation_intent',\n", " 3: 'travel_intent',\n", " 4: 'purchase_intent',\n", " 5: 'weather_intent',\n", " 6: 'translation_intent',\n", " 7: 'unknown'}\n", "label2id = {label:id for id,label in id2label.items()}\n", "\n", "\n", "# output_dir=\"distilbert-base-uncased-lora-intent-classification/checkpoint-37620\"\n", "output_dir = \"distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716\"\n", "# Load the tokenizer (from the output directory)\n", "tokenizer = AutoTokenizer.from_pretrained(output_dir, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=64)\n", "\n", "# Load the base model from the original checkpoint (base pre-trained model)\n", "base_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8, id2label=id2label, label2id=label2id)\n", "\n", "# Load the LoRA configuration and model\n", "peft_config = PeftConfig.from_pretrained(output_dir)\n", "lora_model = PeftModel.from_pretrained(base_model, output_dir)\n", "\n", "# Step 3: Save the combined model to a directory\n", "save_directory = \"tmp/lora_combined_model/\"\n", "lora_model.save_pretrained(save_directory) # Save base model + LoRA weights\n", "\n", "# Now the `lora_model` contains both the base model and the LoRA weights.\n", "lora_model.eval()\n", "\n", "# Example inference\n", "inputs = tokenizer([\"looking for home cleaning \"], return_tensors=\"pt\")\n", "outputs = lora_model(**inputs)\n", "logits = outputs.logits\n", "print(logits)\n", "\n", "\n", "prediction = torch.argmax(logits, dim=1).item()\n", "print(prediction, id2label[prediction])\n", "probabilities = torch.softmax(logits, dim=1)\n", "rounded_probabilities = torch.round(probabilities)\n", "print(rounded_probabilities)" ] }, { "cell_type": "code", "execution_count": null, "id": "d5f259a5-9032-4614-b0fc-b6c685ea250e", "metadata": { "tags": [] }, "outputs": [], "source": [ "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", "from peft import PeftModel, PeftConfig\n", "\n", "# Step 1: Load the base model (DistilBERT)\n", "base_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8, id2label=id2label, label2id=label2id)\n", "\n", "# Step 2: Load the LoRA adapter weights\n", "# output_dir = \"distilbert-base-uncased-lora-intent-classification/checkpoint-37620\"\n", "output_dir = \"distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716\"\n", "peft_config = PeftConfig.from_pretrained(output_dir)\n", "lora_model = PeftModel.from_pretrained(base_model, output_dir)\n", "\n", "# Step 3: Merge LoRA weights into the base model\n", "# After this, the model will have both base and LoRA weights applied\n", "base_model = lora_model.merge_and_unload()\n", "\n", "# Step 4: Save the full model (base model + LoRA weights)\n", "save_directory = \"tmp/lora_combined_model/\"\n", "base_model.save_pretrained(save_directory)\n", "tokenizer = AutoTokenizer.from_pretrained(output_dir) # Load the tokenizer\n", "tokenizer.save_pretrained(save_directory) # Save the tokenizer\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ae882ca2-84fe-43a7-8168-7d581229c588", "metadata": { "tags": [] }, "outputs": [], "source": [ "!ls -ltr tmp/lora_combined_model/" ] }, { "cell_type": "code", "execution_count": null, "id": "9d7669bc-3f6d-40d5-a900-8abdd1e19ba6", "metadata": { "tags": [] }, "outputs": [], "source": [ "base_model" ] }, { "cell_type": "code", "execution_count": null, "id": "8822d6d6-052e-4802-b586-89dcc9ebeec4", "metadata": { "tags": [] }, "outputs": [], "source": [ "save_directory" ] }, { "cell_type": "code", "execution_count": null, "id": "f81e9197-2f9b-4a4c-8eac-d1afd815449d", "metadata": { "tags": [] }, "outputs": [], "source": [ "!python -m pip install --upgrade huggingface_hub\n", "!python -m pip install ipywidgets" ] }, { "cell_type": "code", "execution_count": null, "id": "8f608390-2b2a-4589-93e2-29e14b756e46", "metadata": { "tags": [] }, "outputs": [], "source": [ "from huggingface_hub import login\n", "# login(\"\",write_permission=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "b165d243-6e63-4277-b4c9-dc2413731346", "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import HfApi\n", "api = HfApi()" ] }, { "cell_type": "code", "execution_count": null, "id": "de2dbc16-1b59-4213-a2c0-2476bc95b4fb", "metadata": {}, "outputs": [], "source": [ "# api.upload_file(\n", "# path_or_fileobj=\"/home/jupyter/lora_model_fp16.onnx\",\n", "# path_in_repo=\"onnx/model_fp16.onnx\",\n", "# repo_id=\"chidamnat2002/intent_classifier\",\n", "# repo_type=\"model\",\n", "# )" ] }, { "cell_type": "code", "execution_count": null, "id": "7d7bae40-30ed-45c1-a504-548da1f476ac", "metadata": {}, "outputs": [], "source": [ "# api.upload_file(\n", "# path_or_fileobj=\"tmp/lora_combined_model/model.safetensors\",\n", "# path_in_repo=\"model.safetensors\",\n", "# repo_id=\"chidamnat2002/intent_classifier\",\n", "# repo_type=\"model\",\n", "# )" ] }, { "cell_type": "code", "execution_count": null, "id": "6f8e4414-dc64-49cd-bf4b-8b6e27b3c696", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "environment": { "kernel": "my_env", "name": ".m124", "type": "gcloud", "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/:m124" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 5 }

notebooks/archives/suggest_intent_dl_lora.ipynb (840 lines of code) (raw):