{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/8PmhEIXhBvI?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/8PmhEIXhBvI?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"raw_datasets = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n",
"raw_datasets[\"train\"]"
]
},
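{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick peek (not shown in the video), here is one raw example. WikiText-2 is split at the line level, so many rows are short headings or even empty strings:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A raw text sample from WikiText-2 (row 0 is typically an empty line)\n",
"raw_datasets[\"train\"][1]"
]
},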
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"raw_datasets = load_dataset(\"imdb\")\n",
"raw_datasets = raw_datasets.remove_columns(\"label\")\n",
"\n",
"model_checkpoint = \"distilbert-base-cased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"context_length = 128\n",
"\n",
"def tokenize_pad_and_truncate(texts):\n",
" return tokenizer(texts[\"text\"], truncation=True, padding=\"max_length\", max_length=context_length)\n",
"\n",
"tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)"
]
},
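{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not part of the video), we can verify that padding and truncation gave every example exactly `context_length` tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each example should now be exactly 128 tokens, padded with [PAD] when the review was short\n",
"sample = tokenized_datasets[\"train\"][0]\n",
"len(sample[\"input_ids\"]), tokenizer.decode(sample[\"input_ids\"][:10])"
]
},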
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_and_chunk(texts):\n",
" return tokenizer(\n",
" texts[\"text\"], truncation=True, max_length=context_length,\n",
" return_overflowing_tokens=True\n",
" )\n",
"\n",
"tokenized_datasets = raw_datasets.map(\n",
" tokenize_and_chunk, batched=True, remove_columns=[\"text\"]\n",
")\n",
"\n",
"len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
]
},
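{
"cell_type": "markdown",
"metadata": {},
"source": [
"The tokenized dataset now has more rows than the raw one: with `return_overflowing_tokens=True`, every review longer than `context_length` is split into several chunks instead of being truncated. A quick look at the first few chunk lengths (an extra check, not in the video):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each chunk holds at most 128 tokens; a long review contributes several chunks\n",
"[len(ids) for ids in tokenized_datasets[\"train\"][:5][\"input_ids\"]]"
]
},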
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_and_chunk(texts):\n",
" all_input_ids = []\n",
" for input_ids in tokenizer(texts[\"text\"])[\"input_ids\"]:\n",
" all_input_ids.extend(input_ids)\n",
" all_input_ids.append(tokenizer.eos_token_id)\n",
" \n",
" chunks = []\n",
" for idx in range(0, len(all_input_ids), context_length):\n",
" chunks.append(all_input_ids[idx: idx + context_length])\n",
" return {\"input_ids\": chunks}\n",
"\n",
"tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=[\"text\"])\n",
"\n",
"len(raw_datasets[\"train\"]), len(tokenized_datasets[\"train\"])"
]
},
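{
"cell_type": "markdown",
"metadata": {},
"source": [
"With this concatenate-then-split approach, the last chunk of each batch can be shorter than `context_length`. The data collator below will pad it, but if you want perfectly uniform examples, one variant (a sketch, not from the video) is to drop the incomplete chunk:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Variant: keep only full chunks of exactly context_length tokens\n",
"def tokenize_and_chunk_drop_last(texts):\n",
"    all_input_ids = []\n",
"    for input_ids in tokenizer(texts[\"text\"])[\"input_ids\"]:\n",
"        all_input_ids.extend(input_ids)\n",
"        all_input_ids.append(tokenizer.sep_token_id)\n",
"\n",
"    chunks = [\n",
"        all_input_ids[idx : idx + context_length]\n",
"        for idx in range(0, len(all_input_ids), context_length)\n",
"        if idx + context_length <= len(all_input_ids)\n",
"    ]\n",
"    return {\"input_ids\": chunks}"
]
},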
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import DataCollatorForLanguageModeling\n",
"\n",
"data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)"
]
},
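{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see the collator in action (a small demo, not in the video), we can feed it a couple of examples and decode the result: about 15% of the tokens are selected, most of them replaced by `[MASK]`, and `labels` keeps the original ids at those positions with `-100` everywhere else. The collator returns PyTorch tensors by default, so this assumes `torch` is installed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collate two chunked examples; the collator pads them and applies random masking\n",
"samples = [tokenized_datasets[\"train\"][i] for i in range(2)]\n",
"batch = data_collator(samples)\n",
"print(batch[\"input_ids\"].shape, batch[\"labels\"].shape)\n",
"print(tokenizer.decode(batch[\"input_ids\"][0]))"
]
},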
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Data processing for Masked Language Modeling",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}