course/videos/save_load_dataset.ipynb (161 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the dataset, then inspect its `cache_files`\n",
    "raw_datasets = load_dataset(\"allocine\")\n",
    "raw_datasets.cache_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the whole dataset to the `my-arrow-datasets` directory\n",
    "raw_datasets.save_to_disk(\"my-arrow-datasets\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_from_disk\n",
    "\n",
    "# Read back what `save_to_disk` wrote in the previous cell\n",
    "arrow_datasets_reloaded = load_from_disk(\"my-arrow-datasets\")\n",
    "arrow_datasets_reloaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save in CSV format, one file per split; `index=False` keeps the row index out of the files\n",
    "for split, dataset in raw_datasets.items():\n",
    "    dataset.to_csv(f\"my-dataset-{split}.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each split name to the CSV file written for it above\n",
    "data_files = {\n",
    "    \"train\": \"my-dataset-train.csv\",\n",
    "    \"validation\": \"my-dataset-validation.csv\",\n",
    "    \"test\": \"my-dataset-test.csv\",\n",
    "}\n",
    "\n",
    "# Reload with the `csv` script\n",
    "csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n",
    "csv_datasets_reloaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save in JSON Lines format\n",
    "for split, dataset in raw_datasets.items():\n",
    "    dataset.to_json(f\"my-dataset-{split}.jsonl\")\n",
    "\n",
    "# Save in Parquet format\n",
    "for split, dataset in raw_datasets.items():\n",
    "    dataset.to_parquet(f\"my-dataset-{split}.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each split name to the files written in the previous cell\n",
    "json_data_files = {\n",
    "    \"train\": \"my-dataset-train.jsonl\",\n",
    "    \"validation\": \"my-dataset-validation.jsonl\",\n",
    "    \"test\": \"my-dataset-test.jsonl\",\n",
    "}\n",
    "\n",
    "parquet_data_files = {\n",
    "    \"train\": \"my-dataset-train.parquet\",\n",
    "    \"validation\": \"my-dataset-validation.parquet\",\n",
    "    \"test\": \"my-dataset-test.parquet\",\n",
    "}\n",
    "\n",
    "# Reload with the `json` script\n",
    "json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n",
    "# Reload with the `parquet` script\n",
    "parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Saving and reloading a dataset",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}