course/videos/building_tokenizer.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MR8tZm5ViWU?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MR8tZm5ViWU?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n",
"\n",
"\n",
"def get_training_corpus():\n",
" for i in range(0, len(dataset), 1000):\n",
" yield dataset[i : i + 1000][\"text\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.normalizer = normalizers.Sequence(\n",
" [\n",
" normalizers.Replace(Regex(r\"[\\p{Other}&&[^\\n\\t\\r]]\"), \"\"),\n",
" normalizers.Replace(Regex(r\"[\\s]\"), \" \"),\n",
" normalizers.Lowercase(),\n",
" normalizers.NFD(), normalizers.StripAccents()]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n",
"trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
"sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n",
"tokenizer.post_processor = processors.TemplateProcessing(\n",
" single=f\"[CLS]:0 $A:0 [SEP]:0\",\n",
" pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
" special_tokens=[(\"[CLS]\", cls_token_id), (\"[SEP]\", sep_token_id)],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.decoder = decoders.WordPiece(prefix=\"##\")"
]
},
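{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not part of the cells above; the sentences are arbitrary examples), we can encode a pair of sentences and look at the tokens and type IDs produced by the post-processor."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Arbitrary example sentences, just to check the pipeline end to end\n",
"encoding = tokenizer.encode(\"Let's test this tokenizer...\", \"on a pair of sentences.\")\n",
"print(encoding.tokens)\n",
"print(encoding.type_ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To use the tokenizer with the Transformers library, one option (a sketch reusing the special tokens declared above) is to wrap it in a `PreTrainedTokenizerFast`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import PreTrainedTokenizerFast\n",
"\n",
"# Wrap the tokenizers.Tokenizer object so it behaves like any fast Transformers tokenizer\n",
"wrapped_tokenizer = PreTrainedTokenizerFast(\n",
"    tokenizer_object=tokenizer,\n",
"    unk_token=\"[UNK]\",\n",
"    pad_token=\"[PAD]\",\n",
"    cls_token=\"[CLS]\",\n",
"    sep_token=\"[SEP]\",\n",
"    mask_token=\"[MASK]\",\n",
")"
]
},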
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Building a new tokenizer",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}