local_inference/awq.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "ycrm7hWWxYoX" }, "source": [ "# Run Llama 3.1 8B Instruct with < 5GB VRAM!\n", "\n", "Powered by Transformers & AutoAWQ\n", "\n", "[Model Checkpoint](https://huggingface.co/hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4)\n", "\n", "Note: Whilst we use only the 8B Instruct checkpoint in this example, you can use the same code base for any Llama 3.1 model checkpoint, such as 70B or 405B (and their fine-tunes), as well!\n" ] },
{ "cell_type": "markdown", "metadata": { "id": "PCzvqFRoyGcM" }, "source": [ "## Setup Environment\n", "\n", "Since Llama 3.1 comes with minor modeling changes (primarily RoPE scaling), we'll need to make sure that we're on the latest version of transformers." ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "id": "u5hdTjYoYHqn" }, "outputs": [], "source": [ "# Use the %pip magic (not !pip) so the packages are installed into the environment of the running kernel.\n", "%pip install -q --upgrade transformers autoawq accelerate" ] },
{ "cell_type": "markdown", "metadata": { "id": "fO093ZGCynnm" }, "source": [ "## Load Tokenizer and Model checkpoint" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 94, "referenced_widgets": [ "9033b144293e49c2b85e0275ff344989", "48ee991858974a53a163788d52acfa3e", "69e6b11c2af24fc4a160970a7acf101f", "24aef294d97b4d20b2a395ed168391a4", "b3a1e8193ed14ce0bb27deae89f3713f", "2341ac68838a4b90b9e0ed0190221081", "8fab183cf59042c28f549b1ddda6c249", "0c40b3b7609942d3b8ae37df4cbe859b", "00a9d4213d964811a463180f49584916", "ab2f9d33831944988c6537ca448ffe7d", "9c5e196328004b1eb3e196ee14067ab0" ] }, "id": "OLFqj9b6YW5n", "outputId": "3ae972f5-df52-46f6-802b-9148bd60b7c0" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9033b144293e49c2b85e0275ff344989", "version_major": 2, "version_minor": 
0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig\n", "\n", "model_id = \"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4\"\n", "\n", "# AWQ settings: 4-bit weights with module fusing enabled. With fusing on, fuse_max_seq_len should\n", "# cover the prompt plus the tokens you plan to generate (see the Transformers AWQ guide).\n", "quantization_config = AwqConfig(\n", " bits=4,\n", " fuse_max_seq_len=512, # Note: Update this as per your use-case\n", " do_fuse=True,\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "\n", "# Load the checkpoint with fp16 as the torch dtype and move it to the GPU.\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_id,\n", " torch_dtype=torch.float16,\n", " low_cpu_mem_usage=True,\n", " quantization_config=quantization_config\n", ").to(\"cuda\")" ] },
{ "cell_type": "markdown", "metadata": { "id": "sck3FWKpy11A" }, "source": [ "## Define Prompt & Tokenize" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "id": "HDS8InY0y0WC" }, "outputs": [], "source": [ "# Chat messages: a system instruction followed by the user's question.\n", "prompt = [\n", " {\"role\": \"system\", \"content\": \"You are a helpful assistant, that responds as a pirate.\"},\n", " {\"role\": \"user\", \"content\": \"What's Deep Learning?\"},\n", "]\n", "\n", "# Apply the model's chat template and tokenize to PyTorch tensors on the GPU;\n", "# add_generation_prompt=True ends the prompt with the assistant header so the model writes the reply.\n", "inputs = tokenizer.apply_chat_template(\n", " prompt,\n", " tokenize=True,\n", " add_generation_prompt=True,\n", " return_tensors=\"pt\",\n", " return_dict=True,\n", ").to(\"cuda\")" ] },
{ "cell_type": "markdown", "metadata": { "id": "0veYFyBOzAmG" }, "source": [ "## Generate" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QslxgBSgy7w_", "outputId": "e734b793-ac46-4474-9be4-2fb75545ff19" }, "outputs": [], "source": [ "# Seed PyTorch's RNGs so the sampled completion is repeatable across runs on the same setup.\n", "torch.manual_seed(0)\n", "\n", "outputs = model.generate(\n", "    **inputs,\n", "    do_sample=True,\n", "    max_new_tokens=25,\n", "    pad_token_id=tokenizer.eos_token_id,  # set explicitly so generate() does not have to pick one and warn\n", ")\n", "\n", "# generate() returns the prompt tokens followed by the new tokens, so slice the prompt off\n", "# before decoding to print only the assistant's reply.\n", "prompt_length = inputs[\"input_ids\"].shape[-1]\n", "completion = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]\n", "print(completion)" ] },
{ "cell_type": "markdown", "metadata": { "id": "o__5S-lCzDo3" }, "source": [ "## Voila! You now have a smart and capable assistant! 🦙" ] } ],
"metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "00a9d4213d964811a463180f49584916": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "0c40b3b7609942d3b8ae37df4cbe859b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2341ac68838a4b90b9e0ed0190221081": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "24aef294d97b4d20b2a395ed168391a4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ab2f9d33831944988c6537ca448ffe7d", "placeholder": "", "style": "IPY_MODEL_9c5e196328004b1eb3e196ee14067ab0", "value": " 2/2 [00:01<00:00, 1.45it/s]" } }, 
"48ee991858974a53a163788d52acfa3e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2341ac68838a4b90b9e0ed0190221081", "placeholder": "", "style": "IPY_MODEL_8fab183cf59042c28f549b1ddda6c249", "value": "Loading checkpoint shards: 100%" } }, "69e6b11c2af24fc4a160970a7acf101f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0c40b3b7609942d3b8ae37df4cbe859b", "max": 2, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_00a9d4213d964811a463180f49584916", "value": 2 } }, "8fab183cf59042c28f549b1ddda6c249": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9033b144293e49c2b85e0275ff344989": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_48ee991858974a53a163788d52acfa3e", "IPY_MODEL_69e6b11c2af24fc4a160970a7acf101f", "IPY_MODEL_24aef294d97b4d20b2a395ed168391a4" ], "layout": "IPY_MODEL_b3a1e8193ed14ce0bb27deae89f3713f" } }, "9c5e196328004b1eb3e196ee14067ab0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ab2f9d33831944988c6537ca448ffe7d": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"b3a1e8193ed14ce0bb27deae89f3713f": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } } } } }, "nbformat": 4, "nbformat_minor": 0 }

local_inference/awq.ipynb (561 lines of code) (raw):