demo.ipynb

{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8166a1c6bb7797bb", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:34:34.171635Z", "start_time": "2024-11-27T07:34:34.161464Z" } }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "d277e88f7ea91092", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:34:34.179778Z", "start_time": "2024-11-27T07:34:34.174652Z" } }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "id": "19b1960e-9e0a-401f-be15-d343902eaa21", "metadata": {}, "source": [ "# Spark HuggingFace Connector Demo" ] }, { "cell_type": "markdown", "id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa", "metadata": {}, "source": [ "## Create a Spark Session" ] }, { "cell_type": "code", "execution_count": null, "id": "620d3ecb-b9cb-480c-b300-69198cce7a9c", "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "\n", "spark = (\n", " SparkSession.builder\n", " .config(\"spark.executor.memory\", \"20G\") \n", " .getOrCreate()\n", ")" ] }, { "cell_type": "markdown", "id": "6f876028-2af5-4e63-8e9d-59afc0959267", "metadata": {}, "source": [ "## Load a dataset as a Spark DataFrame\n", "\n", "By default, the connector loads the dataset in streaming mode, equivalent to `load_dataset(..., streaming=True)`." ] }, { "cell_type": "code", "execution_count": 2, "id": "b8580bde-3f64-4c71-a087-8b3f71099aee", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:10:11.691797Z", "start_time": "2024-11-27T07:09:59.993537Z" } }, "outputs": [], "source": [ "df = spark.read.format(\"huggingface\").load(\"cornell-movie-review-data/rotten_tomatoes\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:10:11.707299Z", "start_time": "2024-11-27T07:10:11.695157Z" } }, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ "root\n", " |-- text: string (nullable = true)\n", " |-- label: long (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "code", "execution_count": 4, "id": "7f7b9a2b-8733-499a-af56-3c51196d060f", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:10:52.640366Z", "start_time": "2024-11-27T07:10:52.415881Z" } }, "outputs": [ { "data": { "text/plain": [ "DataFrame[text: string, label: bigint]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cache the dataframe to avoid re-downloading data. Note this should be used for small datasets.\n", "df.cache()" ] }, { "cell_type": "code", "execution_count": 5, "id": "df121dba-2e1e-4206-b2bf-db156c298ee1", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:11:26.796618Z", "start_time": "2024-11-27T07:10:59.645232Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/allison.wang/.pyenv/versions/3.11.10/lib/python3.11/multiprocessing/resource_tracker.py:254: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown\n", " warnings.warn('resource_tracker: There appear to be %d '\n", "24/11/27 15:11:14 WARN CheckAllocator: More than one DefaultAllocationManager on classpath. 
Choosing first found\n", " \r" ] }, { "data": { "text/plain": [ "8530" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Trigger the cache computation\n", "df.count()" ] }, { "cell_type": "code", "execution_count": 6, "id": "8866bdfb-0782-4430-8b1e-09c65e699f41", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:11:35.994254Z", "start_time": "2024-11-27T07:11:35.931924Z" }, "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "id": "225bbbef-4164-424d-a701-c6c74494ef81", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:11:41.923050Z", "start_time": "2024-11-27T07:11:41.754692Z" } }, "outputs": [ { "data": { "text/plain": [ "4265" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Then you can operate on this dataframe\n", "df.filter(df.label == 0).count()" ] }, { "cell_type": "markdown", "id": "bae9bc7f48526c36", "metadata": {}, "source": [ "## Load a Dataset with a configuration/subset\n", "Some datasets require explicitly specifying the config name. You can pass this as a data source option." 
] }, { "cell_type": "markdown", "id": "3932f1fd-a324-4f15-86e1-bbe1064d707a", "metadata": {}, "source": [ "## Load a different split\n", "You can select which split to load with the `split` data source option." ] }, { "cell_type": "code", "execution_count": 8, "id": "a16e9270-eb02-4568-8739-db4dc715c274", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:12:02.814196Z", "start_time": "2024-11-27T07:11:54.300211Z" } }, "outputs": [], "source": [ "test_df = (\n", " spark.read.format(\"huggingface\")\n", " .option(\"split\", \"test\")\n", " .load(\"cornell-movie-review-data/rotten_tomatoes\")\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:12:02.827481Z", "start_time": "2024-11-27T07:12:02.817828Z" } }, "outputs": [ { "data": { "text/plain": [ "DataFrame[text: string, label: bigint]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.cache()" ] }, { "cell_type": "code", "execution_count": 10, "id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:12:16.765461Z", "start_time": "2024-11-27T07:12:02.891782Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "data": { "text/plain": [ "1066" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.count()" ] }, { "cell_type": "code", "execution_count": 11, "id": "df1ad003-1476-4557-811b-31c3888c0030", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:12:16.905233Z", "start_time": "2024-11-27T07:12:16.825661Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----+\n", "| text|label|\n", "+--------------------+-----+\n", "|lovingly photogra...| 1|\n", "|consistently clev...| 1|\n", "|it's like a \" big...| 1|\n", "|the story gives a...| 1|\n", "|red dragon \" neve...| 1|\n", 
"+--------------------+-----+\n", "only showing top 5 rows\n" ] } ], "source": [ "test_df.show(n=5)" ] }, { "cell_type": "markdown", "id": "d8481e86aeb61aaf", "metadata": {}, "source": [ "## Load a dataset with multiple shards\n", "\n", "This example uses the [amazon_polarity dataset](https://huggingface.co/datasets/fancyzhx/amazon_polarity), which has 4 shards for the train split." ] }, { "cell_type": "code", "execution_count": 12, "id": "43759f8c136366b8", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:12:25.864047Z", "start_time": "2024-11-27T07:12:16.919834Z" } }, "outputs": [], "source": [ "df = spark.read.format(\"huggingface\").load(\"fancyzhx/amazon_polarity\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "acccc2c299be9205", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:13:04.733705Z", "start_time": "2024-11-27T07:12:50.016560Z" } }, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# You can see there are 4 partitions, each corresponding to one shard.\n", "df.rdd.getNumPartitions()" ] }, { "cell_type": "markdown", "id": "ae7ad16dfecf0e4c", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "3587271d9a4f31ac", "metadata": {}, "source": [ "## Load a dataset without streaming\n", "\n", "Setting the `streaming` data source option to `false` is equivalent to `load_dataset(..., streaming=False)`." ] }, { "cell_type": "code", "execution_count": null, "id": "5d319d7e93545788", "metadata": {}, "outputs": [], "source": [ "df = spark.read.format(\"huggingface\").option(\"streaming\", \"false\").load(\"stanfordnlp/imdb\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "bb4cdc5d8de427ea", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:15:03.325628Z", "start_time": "2024-11-27T07:14:39.711977Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Stage 13:> (0 + 1) / 1]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"+--------------------+-----+\n", "| text|label|\n", "+--------------------+-----+\n", "|I rented I AM CUR...| 0|\n", "|\"I Am Curious: Ye...| 0|\n", "|If only to avoid ...| 0|\n", "|This film was pro...| 0|\n", "|Oh, brother...aft...| 0|\n", "+--------------------+-----+\n", "only showing top 5 rows\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] } ], "source": [ "df.show(n=5)" ] }, { "cell_type": "code", "execution_count": 16, "id": "f444cb40b7ae5044", "metadata": { "ExecuteTime": { "end_time": "2024-11-27T07:17:07.400787Z", "start_time": "2024-11-27T07:16:43.681788Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Stage 14:> (0 + 1) / 1]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----+\n", "| text|label|\n", "+--------------------+-----+\n", "|Zentropa has much...| 1|\n", "|Zentropa is the m...| 1|\n", "|Lars Von Trier is...| 1|\n", "|*Contains spoiler...| 1|\n", "|That was the firs...| 1|\n", "+--------------------+-----+\n", "only showing top 5 rows\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] } ], "source": [ "df.filter(df.label == 1).show(n=5)" ] }, { "cell_type": "code", "execution_count": null, "id": "a2da54a5cefe1fa3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "pyspark_huggingface", "language": "python", "name": "pyspark_huggingface" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" } }, "nbformat": 4, "nbformat_minor": 5 }

demo.ipynb (568 lines of code) (raw):