local_inference/awq.ipynb (561 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "ycrm7hWWxYoX"
},
"source": [
"# Run Llama 3.1 8B Instruct with < 5GB VRAM!\n",
"\n",
"Powered by Transformers & AutoAWQ\n",
"\n",
"[Model Checkpoint](https://huggingface.co/hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4)\n",
"\n",
"Note: Whilst we use only 8B Instruct checkpoint in this example, you can use the same code base for any Llama 3.1 model checkpoint like 70B, 405B (& fine-tune) as well!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PCzvqFRoyGcM"
},
"source": [
"## Setup Environment\n",
"\n",
"Since Llama 3.1 comes with minor modeling changes (primarily RoPE scaling), we'll need to make sure that we're on the latest version of transformers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "u5hdTjYoYHqn"
},
"outputs": [],
"source": [
"!pip install -q --upgrade transformers autoawq accelerate"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fO093ZGCynnm"
},
"source": [
"## Load Tokenizer and Model checkpoint"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 94,
"referenced_widgets": [
"9033b144293e49c2b85e0275ff344989",
"48ee991858974a53a163788d52acfa3e",
"69e6b11c2af24fc4a160970a7acf101f",
"24aef294d97b4d20b2a395ed168391a4",
"b3a1e8193ed14ce0bb27deae89f3713f",
"2341ac68838a4b90b9e0ed0190221081",
"8fab183cf59042c28f549b1ddda6c249",
"0c40b3b7609942d3b8ae37df4cbe859b",
"00a9d4213d964811a463180f49584916",
"ab2f9d33831944988c6537ca448ffe7d",
"9c5e196328004b1eb3e196ee14067ab0"
]
},
"id": "OLFqj9b6YW5n",
"outputId": "3ae972f5-df52-46f6-802b-9148bd60b7c0"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9033b144293e49c2b85e0275ff344989",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import torch\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig\n",
"\n",
"model_id = \"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4\"\n",
"\n",
"quantization_config = AwqConfig(\n",
" bits=4,\n",
" fuse_max_seq_len=512, # Note: Update this as per your use-case\n",
" do_fuse=True,\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" torch_dtype=torch.float16,\n",
" low_cpu_mem_usage=True,\n",
" quantization_config=quantization_config\n",
").to(\"cuda\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sck3FWKpy11A"
},
"source": [
"# Define Prompt & Tokenize"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "HDS8InY0y0WC"
},
"outputs": [],
"source": [
"prompt = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant, that responds as a pirate.\"},\n",
" {\"role\": \"user\", \"content\": \"What's Deep Learning?\"},\n",
"]\n",
"\n",
"inputs = tokenizer.apply_chat_template(\n",
" prompt,\n",
" tokenize=True,\n",
" add_generation_prompt=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True,\n",
").to(\"cuda\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0veYFyBOzAmG"
},
"source": [
"# Generate"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QslxgBSgy7w_",
"outputId": "e734b793-ac46-4474-9be4-2fb75545ff19"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\"system\\n\\nYou are a helpful assistant, that responds as a pirate.user\\n\\nWhat's Deep Learning?assistant\\n\\nArrr, ye landlubber! Ye be askin' about Deep Learnin', eh? Well, matey,\"]\n"
]
}
],
"source": [
"outputs = model.generate(**inputs, do_sample=True, max_new_tokens=25)\n",
"print(tokenizer.batch_decode(outputs, skip_special_tokens=True))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "o__5S-lCzDo3"
},
"source": [
"# Voila! You now have a smart and capable assistant! 🦙"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"00a9d4213d964811a463180f49584916": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"0c40b3b7609942d3b8ae37df4cbe859b": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2341ac68838a4b90b9e0ed0190221081": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"24aef294d97b4d20b2a395ed168391a4": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_ab2f9d33831944988c6537ca448ffe7d",
"placeholder": "​",
"style": "IPY_MODEL_9c5e196328004b1eb3e196ee14067ab0",
"value": " 2/2 [00:01<00:00,  1.45it/s]"
}
},
"48ee991858974a53a163788d52acfa3e": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2341ac68838a4b90b9e0ed0190221081",
"placeholder": "​",
"style": "IPY_MODEL_8fab183cf59042c28f549b1ddda6c249",
"value": "Loading checkpoint shards: 100%"
}
},
"69e6b11c2af24fc4a160970a7acf101f": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_0c40b3b7609942d3b8ae37df4cbe859b",
"max": 2,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_00a9d4213d964811a463180f49584916",
"value": 2
}
},
"8fab183cf59042c28f549b1ddda6c249": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9033b144293e49c2b85e0275ff344989": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_48ee991858974a53a163788d52acfa3e",
"IPY_MODEL_69e6b11c2af24fc4a160970a7acf101f",
"IPY_MODEL_24aef294d97b4d20b2a395ed168391a4"
],
"layout": "IPY_MODEL_b3a1e8193ed14ce0bb27deae89f3713f"
}
},
"9c5e196328004b1eb3e196ee14067ab0": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"ab2f9d33831944988c6537ca448ffe7d": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b3a1e8193ed14ce0bb27deae89f3713f": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}