course/videos/translation_processing.ipynb (176 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/XAR8jnZZuUs?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/XAR8jnZZuUs?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, load_metric\n", "\n", "raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n", "\n", "def extract_languages(examples):\n", " inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n", " targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n", " return {\"inputs\": inputs, \"targets\": targets}\n", "\n", "raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=[\"id\", \"translation\"])\n", "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(raw_datasets[\"train\"][10])\n", "print(raw_datasets[\"train\"][11])\n", "print(raw_datasets[\"train\"][12])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "sample = raw_datasets[\"train\"][12]\n", "inputs = tokenizer(sample[\"inputs\"])\n", "targets = tokenizer(sample[\"targets\"])\n", "\n", "\n", "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n", "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "sample = raw_datasets[\"train\"][12]\n", "inputs = tokenizer(sample[\"inputs\"])\n", "with tokenizer.as_target_tokenizer():\n", " targets = tokenizer(sample[\"targets\"])\n", "\n", "\n", "print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n", "print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "max_input_length = 128\n", "max_target_length = 128\n", "\n", "def preprocess_function(examples):\n", " model_inputs = tokenizer(examples[\"inputs\"], max_length=max_input_length, truncation=True)\n", "\n", " # Setup the tokenizer for targets\n", " with tokenizer.as_target_tokenizer():\n", " labels = tokenizer(examples[\"targets\"], max_length=max_target_length, truncation=True)\n", "\n", " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", " return model_inputs\n", "\n", "tokenized_datasets = raw_datasets.map(\n", " preprocess_function, batched=True, remove_columns=[\"inputs\", \"targets\"]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForSeq2Seq\n", "\n", "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "Data processing for Translation", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }