course/videos/building_tokenizer.ipynb (171 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MR8tZm5ViWU?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/MR8tZm5ViWU?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n", "\n", "\n", "def get_training_corpus():\n", " for i in range(0, len(dataset), 1000):\n", " yield dataset[i : i + 1000][\"text\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.normalizer = normalizers.Sequence(\n", " [\n", " normalizers.Replace(Regex(r\"[\\p{Other}&&[^\\n\\t\\r]]\"), \"\"),\n", " normalizers.Replace(Regex(r\"[\\s]\"), \" \"),\n", " normalizers.Lowercase(),\n", " normalizers.NFD(), normalizers.StripAccents()]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n", "trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n", "sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n", "tokenizer.post_processor = processors.TemplateProcessing(\n", " single=f\"[CLS]:0 $A:0 [SEP]:0\",\n", " pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n", " special_tokens=[(\"[CLS]\", cls_token_id), (\"[SEP]\", sep_token_id)],\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.decoder = decoders.WordPiece(prefix=\"##\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { 
"name": "Building a new tokenizer", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }