course/videos/train_new_tokenizer.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/DJimQynXZsQ?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/DJimQynXZsQ?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import BertTokenizerFast\n", "\n", "tokenizer = BertTokenizerFast.from_pretrained(\n", " 'huggingface-course/bert-base-uncased-tokenizer-without-normalizer'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = \"here is a sentence adapted to our tokenizer\"\n", "print(tokenizer.tokenize(text))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = \"এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়\"\n", "print(tokenizer.tokenize(text))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = \"this tokenizer does not know àccënts and CAPITAL LETTERS\"\n", "print(tokenizer.tokenize(text))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = \"the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis\"\n", "print(tokenizer.tokenize(text))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"code_search_net\", \"python\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_training_corpus():\n", " dataset = raw_datasets[\"train\"]\n", " for start_idx in range(0, len(dataset), 1000):\n", " samples = dataset[start_idx : start_idx + 1000]\n", " yield samples[\"whole_func_string\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "old_tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", "training_corpus = get_training_corpus()\n", "new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)\n", "new_tokenizer.save_pretrained(\"code-search-net-tokenizer\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "example = \"\"\"class LinearLayer():\n", " def __init__(self, input_size, output_size):\n", " self.weight = torch.randn(input_size, output_size)\n", " self.bias = torch.zeros(output_size)\n", "\n", " def __call__(self, x):\n", " return x @ self.weights + self.bias\n", " \"\"\"\n", 
"\n", "print(old_tokenizer.tokenize(example))\n", "print(new_tokenizer.tokenize(example))" ] } ], "metadata": { "colab": { "name": "Training a new tokenizer", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }