course/videos/batch_inputs_pt.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]\n",
"[1045, 5223, 2023, 1012]\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"sentences = [\n",
" \"I've been waiting for a HuggingFace course my whole life.\",\n",
" \"I hate this.\",\n",
"]\n",
"tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n",
"ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
"print(ids[0])\n",
"print(ids[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "expected sequence of length 14 at dim 1 (got 4)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-b3483c81e2fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m [1045, 5223, 2023, 1012]]\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0minput_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mValueError\u001b[0m: expected sequence of length 14 at dim 1 (got 4)"
]
}
],
"source": [
"import torch\n",
"\n",
"ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012]]\n",
"\n",
"input_ids = torch.tensor(ids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
"\n",
"input_ids = torch.tensor(ids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"tokenizer.pad_token_id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>)\n",
"tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n",
"tensor([[-2.7276, 2.8789],\n",
" [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)\n"
]
}
],
"source": [
"from transformers import AutoModelForSequenceClassification\n",
"\n",
"ids1 = torch.tensor(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]\n",
")\n",
"ids2 = torch.tensor([[1045, 5223, 2023, 1012]])\n",
"all_ids = torch.tensor(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")\n",
"\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
"print(model(ids1).logits)\n",
"print(model(ids2).logits)\n",
"print(model(all_ids).logits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_ids = torch.tensor(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")\n",
"attention_mask = torch.tensor(\n",
" [[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" [ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>)\n",
"tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n"
]
}
],
"source": [
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
"output1 = model(ids1)\n",
"output2 = model(ids2)\n",
"print(output1.logits)\n",
"print(output2.logits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[-2.7276, 2.8789],\n",
" [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n"
]
}
],
"source": [
"output = model(all_ids, attention_mask=attention_mask)\n",
"print(output.logits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"sentences = [\n",
" \"I've been waiting for a HuggingFace course my whole life.\",\n",
" \"I hate this.\",\n",
"]\n",
"print(tokenizer(sentences, padding=True))"
]
},
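{
"cell_type": "markdown",
"metadata": {},
"source": [
"The tokenizer can also return PyTorch tensors directly with `return_tensors=\"pt\"`. As an extra step (not shown in the video), the batch it builds can be passed straight to the model loaded above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extra (not in the video): build the padded batch as PyTorch tensors and feed it to the model\n",
"batch = tokenizer(sentences, padding=True, return_tensors=\"pt\")\n",
"output = model(**batch)\n",
"print(output.logits)"
]
},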
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Batching inputs together (PyTorch)",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}