course/videos/datasets_and

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/tfcY1067A5Q?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/tfcY1067A5Q?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic so the packages land in the environment of the running kernel.\n",
    "%pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the `train` split of the `all_languages` configuration\n",
    "dataset = load_dataset(\"swiss_judgment_prediction\", \"all_languages\", split=\"train\")\n",
    "# Inspect the first example in the default output format\n",
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the output format to pandas.DataFrame\n",
    "dataset.set_format(\"pandas\")\n",
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `dataset[0]` is shorthand for `dataset.__getitem__(0)`.\n",
    "# Go back to the default format first so the before/after comparison is meaningful.\n",
    "dataset.reset_format()\n",
    "dataset.__getitem__(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `set_format` changes what `__getitem__` returns; the underlying data is the same dataset object.\n",
    "dataset.set_format(\"pandas\")\n",
    "dataset.__getitem__(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the whole split to a single pandas.DataFrame\n",
    "df = dataset.to_pandas()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How are languages distributed across regions?\n",
    "df.groupby(\"region\")[\"language\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Which legal area is most common?\n",
    "df[\"legal area\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Load a pretrained tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "# Tokenize the `text` column\n",
    "# NOTE: the dataset is still in pandas format here, so this call is expected to fail;\n",
    "# the next cell resets the format before tokenizing.\n",
    "dataset.map(lambda x: tokenizer(x[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reset back to Arrow format\n",
    "dataset.reset_format()\n",
    "# Now we can tokenize!\n",
    "dataset.map(lambda x: tokenizer(x[\"text\"]))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Datasets + DataFrames = ❤️",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

course/videos/datasets_and_dataframes.ipynb (148 lines of code) (raw):