course/videos/token_processing.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/iY2AZYdZAr0?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/iY2AZYdZAr0?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! 
pip install datasets transformers[sentencepiece]" ] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load conll2003, drop the chunk/id/POS columns, and rename\n",
    "# `ner_tags` -> `labels` and `tokens` -> `words`.\n",
    "raw_datasets = (\n",
    "    load_dataset(\"conll2003\")\n",
    "    .remove_columns([\"chunk_tags\", \"id\", \"pos_tags\"])\n",
    "    .rename_column(\"ner_tags\", \"labels\")\n",
    "    .rename_column(\"tokens\", \"words\")\n",
    ")\n",
    "raw_datasets[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(raw_datasets[\"train\"][0][\"words\"])\n",
    "print(raw_datasets[\"train\"][0][\"labels\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_names = raw_datasets[\"train\"].features[\"labels\"].feature.names\n",
    "label_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_checkpoint = \"bert-base-cased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "inputs = tokenizer(raw_datasets[\"train\"][0][\"words\"], is_split_into_words=True)\n",
    "inputs.tokens()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def shift_label(label):\n",
    "    \"\"\"Return the I-XXX id for a B-XXX label id; any other id is returned unchanged.\n",
    "\n",
    "    B-XXX ids are recognised by being odd, and the matching I-XXX id is taken to be the\n",
    "    next integer. NOTE: this presumes the id ordering shown in `label_names` -- verify\n",
    "    it before reusing this helper with another label set.\n",
    "    \"\"\"\n",
    "    return label + 1 if label % 2 == 1 else label\n",
    "\n",
    "\n",
    "def align_labels_with_tokens(labels, word_ids):\n",
    "    \"\"\"Expand word-level labels into one label per token.\n",
    "\n",
    "    Args:\n",
    "        labels: label ids, indexed by word position.\n",
    "        word_ids: for each token, the index of the word it belongs to, or None.\n",
    "\n",
    "    Returns:\n",
    "        A list with one entry per item of `word_ids`: -100 where the word id is None,\n",
    "        the word's own label for the first token of a word, and the shifted label\n",
    "        (see `shift_label`) for every following token of that same word.\n",
    "    \"\"\"\n",
    "    aligned = []\n",
    "    previous_word = None\n",
    "    for word_id in word_ids:\n",
    "        if word_id is None:\n",
    "            # Token that is not attached to any word\n",
    "            aligned.append(-100)\n",
    "            continue\n",
    "\n",
    "        word_label = labels[word_id]\n",
    "        if word_id == previous_word:\n",
    "            # Still inside the same word\n",
    "            aligned.append(shift_label(word_label))\n",
    "        else:\n",
    "            # First token of a new word\n",
    "            previous_word = word_id\n",
    "            aligned.append(word_label)\n",
    "\n",
    "    return aligned"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": 
[
    "def tokenize_and_align_labels(examples):\n",
    "    \"\"\"Tokenize a batch of pre-split examples and attach token-aligned labels.\n",
    "\n",
    "    Reads `examples[\"words\"]` and `examples[\"labels\"]`, tokenizes the words with\n",
    "    truncation enabled, and stores one aligned label list per example under the\n",
    "    \"labels\" key of the tokenizer output, which is then returned.\n",
    "    \"\"\"\n",
    "    tokenized_inputs = tokenizer(examples[\"words\"], truncation=True, is_split_into_words=True)\n",
    "    # word_ids(i) maps every token of example i back to the word it came from\n",
    "    tokenized_inputs[\"labels\"] = [\n",
    "        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))\n",
    "        for i, labels in enumerate(examples[\"labels\"])\n",
    "    ]\n",
    "    return tokenized_inputs\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForTokenClassification\n",
    "\n",
    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "Data processing for Token Classification",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

course/videos/token_processing.ipynb (178 lines of code) (raw):