{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/XAR8jnZZuUs?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/XAR8jnZZuUs?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, load_metric\n",
"\n",
"raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n",
"\n",
"def extract_languages(examples):\n",
" inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n",
" targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n",
" return {\"inputs\": inputs, \"targets\": targets}\n",
"\n",
"raw_datasets = raw_datasets.map(extract_languages, batched=True, remove_columns=[\"id\", \"translation\"])\n",
"raw_datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(raw_datasets[\"train\"][10])\n",
"print(raw_datasets[\"train\"][11])\n",
"print(raw_datasets[\"train\"][12])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"\n",
"sample = raw_datasets[\"train\"][12]\n",
"inputs = tokenizer(sample[\"inputs\"])\n",
"targets = tokenizer(sample[\"targets\"])\n",
"\n",
"\n",
"print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
"print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"\n",
"sample = raw_datasets[\"train\"][12]\n",
"inputs = tokenizer(sample[\"inputs\"])\n",
"with tokenizer.as_target_tokenizer():\n",
" targets = tokenizer(sample[\"targets\"])\n",
"\n",
"\n",
"print(tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"]))\n",
"print(tokenizer.convert_ids_to_tokens(targets[\"input_ids\"]))"
]
},
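{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the target tokenizer active, the French sentence is split into proper French subwords instead of the odd pieces produced by the English tokenizer, which is why the labels must be tokenized inside the `as_target_tokenizer` context manager. (Recent versions of Transformers also accept the targets directly via `tokenizer(..., text_target=...)`.)"
]
},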
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_input_length = 128\n",
"max_target_length = 128\n",
"\n",
"def preprocess_function(examples):\n",
" model_inputs = tokenizer(examples[\"inputs\"], max_length=max_input_length, truncation=True)\n",
"\n",
" # Setup the tokenizer for targets\n",
" with tokenizer.as_target_tokenizer():\n",
" labels = tokenizer(examples[\"targets\"], max_length=max_target_length, truncation=True)\n",
"\n",
" model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
" return model_inputs\n",
"\n",
"tokenized_datasets = raw_datasets.map(\n",
" preprocess_function, batched=True, remove_columns=[\"inputs\", \"targets\"]\n",
")"
]
},
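{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, we can decode one preprocessed example: the `labels` column should contain the token IDs of the French sentence."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample = tokenized_datasets[\"train\"][12]\n",
"print(tokenizer.convert_ids_to_tokens(sample[\"input_ids\"]))\n",
"print(tokenizer.convert_ids_to_tokens(sample[\"labels\"]))"
]
},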
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import DataCollatorForSeq2Seq\n",
"\n",
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
]
},
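{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see the collator at work, we can batch two examples together: `DataCollatorForSeq2Seq` pads the inputs and pads the `labels` with -100, the index ignored by the loss."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n",
"print(batch[\"labels\"])"
]
}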
],
"metadata": {
"colab": {
"name": "Data processing for Translation",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}