course/videos/batch_inputs_pt.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]\n", "[1045, 5223, 2023, 1012]\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "sentences = [\n", " \"I've been waiting for a HuggingFace course my whole life.\",\n", " \"I hate this.\",\n", "]\n", "tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n", "ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n", "print(ids[0])\n", "print(ids[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "expected sequence of length 14 at dim 1 (got 4)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-2-b3483c81e2fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m [1045, 5223, 2023, 1012]]\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0minput_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mValueError\u001b[0m: expected sequence of length 14 at dim 1 (got 4)" ] } ], "source": [ "import torch\n", "\n", "ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n", " [1045, 5223, 2023, 1012]]\n", "\n", "input_ids = torch.tensor(ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n", " [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n", "\n", "input_ids = torch.tensor(ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": null, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "tokenizer.pad_token_id" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>)\n", "tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n", "tensor([[-2.7276, 2.8789],\n", " [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification\n", "\n", "ids1 = torch.tensor(\n", " [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]\n", ")\n", "ids2 = torch.tensor([[1045, 5223, 2023, 1012]])\n", "all_ids = torch.tensor(\n", " [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n", " [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n", ")\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "print(model(ids1).logits)\n", "print(model(ids2).logits)\n", "print(model(all_ids).logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_ids = torch.tensor(\n", " [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n", " [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n", ")\n", "attention_mask = torch.tensor(\n", " [[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>)\n", "tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n" ] } ], "source": [ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "output1 = model(ids1)\n", "output2 = model(ids2)\n", "print(output1.logits)\n", "print(output2.logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[-2.7276, 2.8789],\n", " [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n" ] } ], "source": [ "output = model(all_ids, attention_mask=attention_mask)\n", "print(output.logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "sentences = [\n", " \"I've been waiting for a HuggingFace course my whole life.\",\n", " \"I hate this.\",\n", "]\n", "print(tokenizer(sentences, padding=True))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "Batching inputs together (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }