# Using a debugger in a notebook

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/rSPyvPw0p9k?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/rSPyvPw0p9k?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"conll2003\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "raw_datasets[\"train\"][0][\"tokens\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "raw_datasets[\"train\"][0][\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", "ner_feature" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "label_names = ner_feature.feature.names\n", "label_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "words = raw_datasets[\"train\"][0][\"tokens\"]\n", "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "line1 = \"\"\n", "line2 = \"\"\n", "for word, label in zip(words, labels):\n", " full_label = label_names[label]\n", " max_length = max(len(word), len(full_label))\n", " line1 += word + \" \" * (max_length - len(word) + 1)\n", " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", "\n", "print(line1)\n", "print(line2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"bert-base-cased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", "inputs.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def align_labels_with_tokens(labels, word_ids):\n", " new_labels = []\n", " current_word = None\n", " for word_id in word_ids:\n", " if word_id != 
current_word:\n", " # Start of a new word!\n", " current_word = word_id\n", " label = -100 if word_id is None else labels[word_id]\n", " new_labels.append(label)\n", " elif word_id is None:\n", " # Special token\n", " new_labels.append(-100)\n", " else:\n", " # Same word as previous token\n", " label = labels[word_id]\n", " # If the label is B-XXX we change it to I-XXX\n", " if label % 2 == 1:\n", " label += 1\n", " new_labels.append(label)\n", "\n", " return new_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "word_ids = inputs.word_ids()\n", "print(labels)\n", "print(align_labels_with_tokens(labels, word_ids))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = tokenizer(\n", " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", " )\n", " all_labels = examples[\"ner_tags\"]\n", " new_labels = []\n", " for i, labels in enumerate(all_labels):\n", " word_ids = tokenized_inputs.word_ids(i)\n", " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", "\n", " tokenized_inputs[\"labels\"] = new_labels\n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = raw_datasets.map(\n", " tokenize_and_align_labels,\n", " batched=True,\n", " remove_columns=raw_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "features = [tokenized_datasets[\"train\"][i] for i in range(8)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "batch = tokenizer.pad(features, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%debug" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "Using a debugger in a notebook", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }