course/videos/memory_mapping_streaming.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/JwISwTCPPWo?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/JwISwTCPPWo?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n", "large_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n", "size_gb = large_dataset.dataset_size / (1024 ** 3)\n", "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import psutil\n", "\n", "# Process.memory_info is expressed in bytes, so convert to megabytes\n", "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import timeit\n", "\n", "code_snippet = \"\"\"batch_size = 1000\n", "\n", "for idx in range(0, len(large_dataset), batch_size):\n", " _ = large_dataset[idx:idx + batch_size]\n", "\"\"\"\n", "\n", "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n", "print(\n", " f\"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in \"\n", " f\"{time:.1f}s, i.e. 
```python
large_dataset_streamed = load_dataset(
    "json", data_files=data_files, split="train", streaming=True
)

next(iter(large_dataset_streamed))
```

```python
type(large_dataset_streamed)
```

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))
```

```python
# Select the first 5 examples
dataset_head = large_dataset_streamed.take(5)
list(dataset_head)
```

```python
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = large_dataset_streamed.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = large_dataset_streamed.take(1000)
```
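Note that `take()` and `skip()` operate on the stream in its original order. If a randomized split is preferred, `IterableDataset` also exposes a buffered `shuffle()`; the snippet below is only a sketch, with arbitrary `buffer_size` and `seed` values, and it downloads just the examples it actually consumes:

```python
from itertools import islice

# Shuffle the stream with a fixed-size buffer before splitting it
# (buffer_size and seed are arbitrary example values).
shuffled_dataset = large_dataset_streamed.shuffle(buffer_size=10_000, seed=42)

validation_dataset = shuffled_dataset.take(1000)
train_dataset = shuffled_dataset.skip(1000)

# Peek at a few validation examples; only the records that are consumed
# get downloaded and decompressed.
for example in islice(validation_dataset, 3):
    print(example["text"][:100])
```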