course/videos/clm_processing.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook contains the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "from datasets import load_dataset, DatasetDict\n",
    "\n",
    "# NOTE: the validation repository is also read through its \"train\" split.\n",
    "ds_train = load_dataset(\"huggingface-course/codeparrot-ds-train\", split=\"train\")\n",
    "ds_valid = load_dataset(\"huggingface-course/codeparrot-ds-valid\", split=\"train\")\n",
    "\n",
    "# Group both datasets so they can be processed with a single `map` call.\n",
    "raw_datasets = DatasetDict(\n",
    "    {\n",
    "        \"train\": ds_train,\n",
    "        \"valid\": ds_valid,\n",
    "    }\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"huggingface-course/code-search-net-tokenizer\")\n",
    "model = AutoModelForCausalLM.from_pretrained(\"huggingface-course/codeparrot-ds\")\n",
    "# Single-example batch, reused in the last cell to compute a loss.\n",
    "batch = tokenizer([\"import numpy as np\"], return_tensors=\"pt\")\n",
    "\n",
    "# `text` is the toy input for the chunking demo; `context_length` is the chunk size used by `tokenize`.\n",
    "text = \"import numpy as np\\n\"*20\n",
    "context_length = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chunking demo: cap each chunk at 16 tokens and ask the tokenizer to also\n",
    "# return the overflowing chunks and the length of every chunk.\n",
    "outputs = tokenizer(\n",
    "    text,\n",
    "    truncation=True,\n",
    "    max_length=16,\n",
    "    return_overflowing_tokens=True,\n",
    "    return_length=True,\n",
    ")\n",
    "\n",
    "print(f\"Input chunk lengths: {(outputs['length'])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(element):\n",
    "    \"\"\"Tokenize a batch and keep only the chunks that are exactly `context_length` tokens long.\n",
    "\n",
    "    Args:\n",
    "        element: batch handed over by `raw_datasets.map(..., batched=True)`; its\n",
    "            `content` entry is passed to the tokenizer (presumably a list of\n",
    "            source-code strings -- confirm against the dataset).\n",
    "\n",
    "    Returns:\n",
    "        A dict with a single `input_ids` key holding the token ids of every\n",
    "        chunk whose reported length equals `context_length`. Chunks of any\n",
    "        other length are dropped, so the number of returned rows can differ\n",
    "        from the number of input rows.\n",
    "    \"\"\"\n",
    "    outputs = tokenizer(\n",
    "        element[\"content\"],\n",
    "        truncation=True,\n",
    "        max_length=context_length,\n",
    "        return_overflowing_tokens=True,\n",
    "        return_length=True,\n",
    "    )\n",
    "    input_batch = []\n",
    "    for length, input_ids in zip(outputs[\"length\"], outputs[\"input_ids\"]):\n",
    "        # Keep full-size chunks only.\n",
    "        if length == context_length:\n",
    "            input_batch.append(input_ids)\n",
    "    return {\"input_ids\": input_batch}\n",
    "\n",
    "\n",
    "# The original columns are removed because `tokenize` returns a different\n",
    "# number of rows than it receives.\n",
    "tokenized_datasets = raw_datasets.map(\n",
    "    tokenize, batched=True, remove_columns=raw_datasets[\"train\"].column_names\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same token ids are passed as inputs and as labels; the loss is read\n",
    "# from the returned output object.\n",
    "output = model(input_ids=batch[\"input_ids\"], labels=batch[\"input_ids\"])\n",
    "loss = output.loss"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Data processing for Causal Language Modeling",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

course/videos/clm_processing.ipynb (143 lines of code) (raw):