course/videos/save_load_dataset.ipynb (161 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"# Markup for the embedded YouTube player of the video this notebook accompanies.\n",
"video_iframe = '<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>'\n",
"\n",
"# Leaving the HTML object as the last expression makes the notebook render it.\n",
"HTML(video_iframe)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `%pip` installs into the environment of the running kernel, whereas `! pip` uses\n",
"# whichever `pip` is first on the shell PATH. The extra is quoted so the shell does\n",
"# not interpret `[sentencepiece]` as a glob pattern.\n",
"%pip install datasets \"transformers[sentencepiece]\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Identifier of the dataset used throughout this notebook.\n",
"dataset_name = \"allocine\"\n",
"raw_datasets = load_dataset(dataset_name)\n",
"\n",
"# Display the `cache_files` attribute of the loaded dataset.\n",
"raw_datasets.cache_files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Persist `raw_datasets` under a local path with `save_to_disk`.\n",
"arrow_dir = \"my-arrow-datasets\"\n",
"raw_datasets.save_to_disk(arrow_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_from_disk\n",
"\n",
"# Read back what `save_to_disk` wrote to this path, then display the result.\n",
"saved_path = \"my-arrow-datasets\"\n",
"arrow_datasets_reloaded = load_from_disk(saved_path)\n",
"arrow_datasets_reloaded"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write one CSV file per split, named after the split.\n",
"# `index` is a boolean option forwarded to pandas, so pass `False` explicitly\n",
"# (rather than relying on `None` being falsy) to leave the row index out of the file.\n",
"for split, dataset in raw_datasets.items():\n",
"    dataset.to_csv(f\"my-dataset-{split}.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Map each split name to the CSV file that holds it (`my-dataset-<split>.csv`).\n",
"data_files = {\n",
"    split: f\"my-dataset-{split}.csv\" for split in (\"train\", \"validation\", \"test\")\n",
"}\n",
"\n",
"# Reload the splits with the `csv` loader and display the result.\n",
"csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n",
"csv_datasets_reloaded"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export every split twice, one file per split and per format.\n",
"for split, dataset in raw_datasets.items():\n",
"    # Save in JSON Lines format\n",
"    dataset.to_json(f\"my-dataset-{split}.jsonl\")\n",
"    # Save in Parquet format\n",
"    dataset.to_parquet(f\"my-dataset-{split}.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split names shared by both file mappings below.\n",
"splits = (\"train\", \"validation\", \"test\")\n",
"\n",
"# One file per split for each format: `my-dataset-<split>.jsonl` / `.parquet`.\n",
"json_data_files = {split: f\"my-dataset-{split}.jsonl\" for split in splits}\n",
"parquet_data_files = {split: f\"my-dataset-{split}.parquet\" for split in splits}\n",
"\n",
"# Reload with the `json` script\n",
"json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n",
"# Reload with the `parquet` script\n",
"parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)"
]
}
],
"metadata": {
"colab": {
"name": "Saving and reloading a dataset",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}