community-artifacts/Utilities/Column-vector-operations-v1.ipynb (2,553 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Column and vector operations\n", "\n", "Column and vector operations were added in 1.15.\n", "\n", "* cols2vec\n", "* vec2cols\n", "* drop columns" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", " \"You should import from traitlets.config instead.\", ShimWarning)\n", "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" ] } ], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "u'Connected: gpadmin@madlib'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Greenplum Database 5.4.0 on GCP (demo machine)\n", "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", " \n", "# PostgreSQL local\n", "#%sql postgresql://fmcquillan@localhost:5432/madlib" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();\n", "#%sql select version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cols2vec\n", "\n", "Convert feature columns in a table into an array in a single column.\n", "\n", "Given a table with a number of feature columns, this function will create an output table that contains the feature columns in an array. A summary table will also be created that contains the names of the features combined into array, so that this process can be reversed using the function vec2cols." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load sample data" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "14 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>Temp_Humidity</th>\n", " <th>clouds_airquality</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>observation_weight</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>[85.0, 85.0]</td>\n", " <td>[u'none', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>[80.0, 90.0]</td>\n", " <td>[u'none', u'moderate']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>[83.0, 78.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>[70.0, 96.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>[68.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>[65.0, 70.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>[64.0, 65.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>[72.0, 95.0]</td>\n", " <td>[u'high', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>[69.0, 70.0]</td>\n", " <td>[u'high', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>[75.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>[75.0, 70.0]</td>\n", " <td>[u'none', u'good']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>[72.0, 90.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>[81.0, 75.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>[71.0, 80.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS golf CASCADE;\n", "\n", "CREATE TABLE golf (\n", " id integer NOT NULL,\n", " \"OUTLOOK\" text,\n", " temperature double precision,\n", " humidity double precision,\n", " \"Temp_Humidity\" double precision[],\n", " clouds_airquality text[],\n", " windy boolean,\n", " class text,\n", " observation_weight double precision\n", ");\n", "\n", "INSERT INTO golf VALUES\n", "(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n", "(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n", "(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n", "(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n", "(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", "(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n", "(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", "(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n", "(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n", "(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", "(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n", "(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", "(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n", "(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n", "\n", "SELECT * FROM golf ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Create feature array\n", "Combine the temperature and humidity columns into a single array feature." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>[85.0, 85.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[68.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[69.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[81.0, 75.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[80.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[65.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[75.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[71.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[83.0, 78.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[64.0, 65.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[75.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[70.0, 96.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[72.0, 95.0]</td>\n", " </tr>\n", " <tr>\n", " <td>[72.0, 90.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[([85.0, 85.0],),\n", " ([68.0, 80.0],),\n", " ([69.0, 70.0],),\n", " ([81.0, 75.0],),\n", " ([80.0, 90.0],),\n", " ([65.0, 70.0],),\n", " ([75.0, 80.0],),\n", " ([71.0, 80.0],),\n", " ([83.0, 78.0],),\n", " ([64.0, 65.0],),\n", " ([75.0, 70.0],),\n", " ([70.0, 96.0],),\n", " ([72.0, 95.0],),\n", " ([72.0, 90.0],)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " 'temperature, humidity'\n", ");\n", "\n", "SELECT * FROM cols2vec_result;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>source_table</th>\n", " <th>list_of_features</th>\n", " <th>list_of_features_to_exclude</th>\n", " <th>feature_names</th>\n", " </tr>\n", " <tr>\n", " <td>golf</td>\n", " <td>temperature, humidity</td>\n", " <td>None</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM cols2vec_result_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Create feature array and keep some source table columns\n", "Combine the temperature and humidity columns and keep 2 other columns from source_table." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>[85.0, 85.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>[80.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>[83.0, 78.0]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>[70.0, 96.0]</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>[68.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>[65.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>[64.0, 65.0]</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>[72.0, 95.0]</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>[69.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>[75.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>[75.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>[72.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>[81.0, 75.0]</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>[71.0, 80.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', [85.0, 85.0]),\n", " (2, u'sunny', [80.0, 90.0]),\n", " (3, u'overcast', [83.0, 78.0]),\n", " (4, u'rain', [70.0, 96.0]),\n", " (5, u'rain', [68.0, 80.0]),\n", " (6, u'rain', [65.0, 70.0]),\n", " (7, u'overcast', [64.0, 65.0]),\n", " (8, u'sunny', [72.0, 95.0]),\n", " (9, u'sunny', [69.0, 70.0]),\n", " (10, u'rain', [75.0, 80.0]),\n", " (11, u'sunny', [75.0, 70.0]),\n", " (12, u'overcast', [72.0, 90.0]),\n", " (13, u'overcast', [81.0, 75.0]),\n", " (14, u'rain', [71.0, 80.0])]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " 'temperature, humidity',\n", " NULL,\n", " 'id, \"OUTLOOK\"'\n", ");\n", "\n", "SELECT * FROM cols2vec_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>source_table</th>\n", " <th>list_of_features</th>\n", " <th>list_of_features_to_exclude</th>\n", " <th>feature_names</th>\n", " </tr>\n", " <tr>\n", " <td>golf</td>\n", " <td>temperature, humidity</td>\n", " <td>None</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM cols2vec_result_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Exclude all columns that are not double precision\n", "Combine all columns, excluding all columns that are not of type double precision" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>[85.0, 85.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>[80.0, 90.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>[83.0, 78.0, 1.5]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>[70.0, 96.0, 1.0]</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>[68.0, 80.0, 1.0]</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>[65.0, 70.0, 1.0]</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>[64.0, 65.0, 1.5]</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>[72.0, 95.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>[69.0, 70.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>[75.0, 80.0, 1.0]</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>[75.0, 70.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>[72.0, 90.0, 1.5]</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>[81.0, 75.0, 1.5]</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>[71.0, 80.0, 1.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', [85.0, 85.0, 5.0]),\n", " (2, u'sunny', [80.0, 90.0, 5.0]),\n", " (3, u'overcast', [83.0, 78.0, 1.5]),\n", " (4, u'rain', [70.0, 96.0, 1.0]),\n", " (5, u'rain', [68.0, 80.0, 1.0]),\n", " (6, u'rain', [65.0, 70.0, 1.0]),\n", " (7, u'overcast', [64.0, 65.0, 1.5]),\n", " (8, u'sunny', [72.0, 95.0, 5.0]),\n", " (9, u'sunny', [69.0, 70.0, 5.0]),\n", " (10, u'rain', [75.0, 80.0, 1.0]),\n", " (11, u'sunny', [75.0, 70.0, 5.0]),\n", " (12, u'overcast', [72.0, 90.0, 1.5]),\n", " (13, u'overcast', [81.0, 75.0, 1.5]),\n", " (14, u'rain', [71.0, 80.0, 1.0])]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " '*',\n", " '\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id',\n", " 'id, \"OUTLOOK\"'\n", ");\n", "\n", "SELECT * FROM cols2vec_result ORDER BY id;" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>source_table</th>\n", " <th>list_of_features</th>\n", " <th>list_of_features_to_exclude</th>\n", " <th>feature_names</th>\n", " </tr>\n", " <tr>\n", " <td>golf</td>\n", " <td>*</td>\n", " <td>\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id</td>\n", " <td>[u'temperature', u'humidity', u'observation_weight']</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'golf', u'*', u'\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id', [u'temperature', u'humidity', u'observation_weight'])]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM cols2vec_result_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Keep all columns from source table\n", "Combine the temperature and humidity columns, exclude windy, and keep all of the columns from the source table." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>Temp_Humidity</th>\n", " <th>clouds_airquality</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>observation_weight</th>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>[85.0, 85.0]</td>\n", " <td>[u'none', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>[85.0, 85.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>[80.0, 90.0]</td>\n", " <td>[u'none', u'moderate']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>[80.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>[83.0, 78.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>[83.0, 78.0]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>[70.0, 96.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>[70.0, 96.0]</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>[68.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>[68.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>[65.0, 70.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " <td>[65.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>[64.0, 65.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>[64.0, 65.0]</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>[72.0, 95.0]</td>\n", " <td>[u'high', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>[72.0, 95.0]</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>[69.0, 70.0]</td>\n", " <td>[u'high', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " <td>[69.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>[75.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>[75.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>[75.0, 70.0]</td>\n", " <td>[u'none', u'good']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " <td>[75.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>[72.0, 90.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>[72.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>[81.0, 75.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>[81.0, 75.0]</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>[71.0, 80.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " <td>[71.0, 80.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, [85.0, 85.0]),\n", " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, [80.0, 90.0]),\n", " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, [83.0, 78.0]),\n", " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, [70.0, 96.0]),\n", " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [68.0, 80.0]),\n", " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [65.0, 70.0]),\n", " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, [64.0, 65.0]),\n", " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, [72.0, 95.0]),\n", " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, [69.0, 70.0]),\n", " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [75.0, 80.0]),\n", " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, [75.0, 70.0]),\n", " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, [72.0, 90.0]),\n", " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, [81.0, 75.0]),\n", " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [71.0, 80.0])]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " 'windy, temperature, humidity',\n", " 'windy',\n", " '*'\n", ");\n", "\n", "SELECT * FROM cols2vec_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View summary table:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>source_table</th>\n", " <th>list_of_features</th>\n", " <th>list_of_features_to_exclude</th>\n", " <th>feature_names</th>\n", " </tr>\n", " <tr>\n", " <td>golf</td>\n", " <td>windy, temperature, humidity</td>\n", " <td>windy</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'golf', u'windy, temperature, humidity', u'windy', [u'temperature', u'humidity'])]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM cols2vec_result_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This also shows that you can exclude features in 'list_of_features_to_exclude' that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' is generated from an expression or subquery." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Boolean casting\n", "\n", "Type casting works as per regular rules of the underlying database. E.g, combining integer and double precisions columns will create a double precision feature vector. For Boolean, do an explicit cast to the target type:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[u'false', u\"Don't Play\"]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[u'true', u\"Don't Play\"]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>[u'true', u\"Don't Play\"]</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>[u'true', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>[u'false', u\"Don't Play\"]</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>[u'true', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>[u'true', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>[u'false', u'Play']</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>[u'true', u\"Don't Play\"]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [u'false', u\"Don't Play\"]),\n", " (2, [u'true', u\"Don't Play\"]),\n", " (3, [u'false', u'Play']),\n", " (4, [u'false', u'Play']),\n", " (5, [u'false', u'Play']),\n", " (6, [u'true', u\"Don't Play\"]),\n", " (7, [u'true', u'Play']),\n", " (8, [u'false', u\"Don't Play\"]),\n", " (9, [u'false', u'Play']),\n", " (10, [u'false', u'Play']),\n", " (11, [u'true', u'Play']),\n", " (12, [u'true', u'Play']),\n", " (13, [u'false', u'Play']),\n", " (14, [u'true', u\"Don't Play\"])]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " 'windy::TEXT, class',\n", " NULL,\n", " 'id'\n", ");\n", "\n", "SELECT * FROM cols2vec_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Vec2cols\n", "\n", "Converts a feature array in a single column into multiple columns. This process can be used to reverse the function cols2vec.\n", "\n", "Given a table with a column of type array, this function will create an output table that splits this array into multiple columns, one per array element. It includes the option to name the new feature columns, and to include columns from the original table in the output." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load sample data" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "14 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>Temp_Humidity</th>\n", " <th>clouds_airquality</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>observation_weight</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>[85.0, 85.0]</td>\n", " <td>[u'none', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>[80.0, 90.0]</td>\n", " <td>[u'none', u'moderate']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>[83.0, 78.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>[70.0, 96.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>[68.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>[65.0, 70.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>[64.0, 65.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>[72.0, 95.0]</td>\n", " <td>[u'high', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>[69.0, 70.0]</td>\n", " <td>[u'high', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>[75.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>[75.0, 70.0]</td>\n", " <td>[u'none', u'good']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>[72.0, 90.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>[81.0, 75.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>[71.0, 80.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS golf CASCADE;\n", "\n", "CREATE TABLE golf (\n", " id integer NOT NULL,\n", " \"OUTLOOK\" text,\n", " temperature double precision,\n", " humidity double precision,\n", " \"Temp_Humidity\" double precision[],\n", " clouds_airquality text[],\n", " windy boolean,\n", " class text,\n", " observation_weight double precision\n", ");\n", "\n", "INSERT INTO golf VALUES\n", "(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n", "(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n", "(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n", "(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n", "(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", "(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n", "(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", "(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n", "(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n", "(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", "(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n", "(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", "(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n", "(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n", "\n", "SELECT * FROM golf ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Split array into columns\n", "Split the column \"clouds_airquality\" into new columns called \"clouds\" and \"air_quality\". Also keep columns id and \"OUTLOOK\" from the source table." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>clouds</th>\n", " <th>air_quality</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>none</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>none</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>low</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>low</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>medium</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>low</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>high</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>high</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>medium</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>none</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>low</td>\n", " <td>unhealthy</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', u'none', u'unhealthy'),\n", " (2, u'sunny', u'none', u'moderate'),\n", " (3, u'overcast', u'low', u'moderate'),\n", " (4, u'rain', u'low', u'moderate'),\n", " (5, u'rain', u'medium', u'good'),\n", " (6, u'rain', u'low', u'unhealthy'),\n", " (7, u'overcast', u'medium', u'moderate'),\n", " (8, u'sunny', u'high', u'unhealthy'),\n", " (9, u'sunny', u'high', u'good'),\n", " (10, u'rain', u'medium', u'good'),\n", " (11, u'sunny', u'none', u'good'),\n", " (12, u'overcast', u'medium', u'moderate'),\n", " (13, u'overcast', u'medium', u'moderate'),\n", " (14, u'rain', u'low', u'unhealthy')]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS vec2cols_result;\n", "\n", "SELECT madlib.vec2cols(\n", " 'golf', -- source table\n", " 'vec2cols_result', -- output table\n", " 'clouds_airquality', -- column with array entries to split\n", " ARRAY['clouds', 'air_quality'], -- feature names\n", " 'id, \"OUTLOOK\", ' -- columns to keep from source table\n", ");\n", "\n", "SELECT * FROM vec2cols_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Auto-generate feature names\n", "Similar to the previous example, except now we keep all columns from source table and do not specify the feature names, so that default names are created." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>OUTLOOK</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>Temp_Humidity</th>\n", " <th>clouds_airquality</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>observation_weight</th>\n", " <th>f1</th>\n", " <th>f2</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>[85.0, 85.0]</td>\n", " <td>[u'none', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>none</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>[80.0, 90.0]</td>\n", " <td>[u'none', u'moderate']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>none</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>[83.0, 78.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>low</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>[70.0, 96.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>low</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>[68.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>medium</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>[65.0, 70.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " <td>low</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>[64.0, 65.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>[72.0, 95.0]</td>\n", " <td>[u'high', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " <td>high</td>\n", " <td>unhealthy</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>[69.0, 70.0]</td>\n", " <td>[u'high', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " <td>high</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>[75.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " <td>medium</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>[75.0, 70.0]</td>\n", " <td>[u'none', u'good']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " <td>none</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>[72.0, 90.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>[81.0, 75.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " <td>medium</td>\n", " <td>moderate</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>[71.0, 80.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " <td>low</td>\n", " <td>unhealthy</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, u'none', u'unhealthy'),\n", " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, u'none', u'moderate'),\n", " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, u'low', u'moderate'),\n", " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, u'low', u'moderate'),\n", " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n", " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy'),\n", " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n", " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, u'high', u'unhealthy'),\n", " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, u'high', u'good'),\n", " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n", " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, u'none', u'good'),\n", " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n", " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, u'medium', u'moderate'),\n", " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy')]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS vec2cols_result;\n", "SELECT madlib.vec2cols(\n", " 'golf', -- source table\n", " 'vec2cols_result', -- output table\n", " 'clouds_airquality', -- column with array entries to split\n", " NULL, -- feature names\n", " '*' -- columns to keep from source table\n", ");\n", "SELECT * FROM vec2cols_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Get feature names from cols2vec summary table\n", "Now let's run cols2vec then reverse it using vec2cols. In this case we will get feature names from the cols2vec summary table.\n", "\n", "First run cols2vec:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>feature_vector</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>[85.0, 85.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>[80.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>[83.0, 78.0]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>[70.0, 96.0]</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>[68.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>[65.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>[64.0, 65.0]</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>[72.0, 95.0]</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>[69.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>[75.0, 80.0]</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>[75.0, 70.0]</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>[72.0, 90.0]</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>[81.0, 75.0]</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>[71.0, 80.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 85.0, 85.0, [85.0, 85.0]),\n", " (2, 80.0, 90.0, [80.0, 90.0]),\n", " (3, 83.0, 78.0, [83.0, 78.0]),\n", " (4, 70.0, 96.0, [70.0, 96.0]),\n", " (5, 68.0, 80.0, [68.0, 80.0]),\n", " (6, 65.0, 70.0, [65.0, 70.0]),\n", " (7, 64.0, 65.0, [64.0, 65.0]),\n", " (8, 72.0, 95.0, [72.0, 95.0]),\n", " (9, 69.0, 70.0, [69.0, 70.0]),\n", " (10, 75.0, 80.0, [75.0, 80.0]),\n", " (11, 75.0, 70.0, [75.0, 70.0]),\n", " (12, 72.0, 90.0, [72.0, 90.0]),\n", " (13, 81.0, 75.0, [81.0, 75.0]),\n", " (14, 71.0, 80.0, [71.0, 80.0])]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", "SELECT madlib.cols2vec(\n", " 'golf',\n", " 'cols2vec_result',\n", " 'temperature, humidity',\n", " NULL,\n", " 'id, temperature, humidity'\n", ");\n", "SELECT * FROM cols2vec_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table with the feature_names dictionary:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>source_table</th>\n", " <th>list_of_features</th>\n", " <th>list_of_features_to_exclude</th>\n", " <th>feature_names</th>\n", " </tr>\n", " <tr>\n", " <td>golf</td>\n", " <td>temperature, humidity</td>\n", " <td>None</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM cols2vec_result_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now use feature_names from the summary table above to name the columns of the split array:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 85.0, 85.0),\n", " (2, 80.0, 90.0),\n", " (3, 83.0, 78.0),\n", " (4, 70.0, 96.0),\n", " (5, 68.0, 80.0),\n", " (6, 65.0, 70.0),\n", " (7, 64.0, 65.0),\n", " (8, 72.0, 95.0),\n", " (9, 69.0, 70.0),\n", " (10, 75.0, 80.0),\n", " (11, 75.0, 70.0),\n", " (12, 72.0, 90.0),\n", " (13, 81.0, 75.0),\n", " (14, 71.0, 80.0)]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS vec2cols_result;\n", "SELECT madlib.vec2cols(\n", " 'cols2vec_result', -- source table containing the feature vector\n", " 'vec2cols_result', -- output table\n", " 'feature_vector', -- column with array entries to split\n", " (SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec\n", " 'id' -- columns to keep from source table\n", ");\n", "SELECT * FROM vec2cols_result ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the same as the format of the original 'golf' dataset that we started with." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop columns\n", "Drop some columns from the source table." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "14 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>Temp_Humidity</th>\n", " <th>clouds_airquality</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>observation_weight</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[85.0, 85.0]</td>\n", " <td>[u'none', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[80.0, 90.0]</td>\n", " <td>[u'none', u'moderate']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[83.0, 78.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[70.0, 96.0]</td>\n", " <td>[u'low', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>[68.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>[65.0, 70.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>[64.0, 65.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>[72.0, 95.0]</td>\n", " <td>[u'high', u'unhealthy']</td>\n", " <td>False</td>\n", " <td>Don't Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>[69.0, 70.0]</td>\n", " <td>[u'high', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>[75.0, 80.0]</td>\n", " <td>[u'medium', u'good']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>[75.0, 70.0]</td>\n", " <td>[u'none', u'good']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>[72.0, 90.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>True</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>[81.0, 75.0]</td>\n", " <td>[u'medium', u'moderate']</td>\n", " <td>False</td>\n", " <td>Play</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>[71.0, 80.0]</td>\n", " <td>[u'low', u'unhealthy']</td>\n", " <td>True</td>\n", " <td>Don't Play</td>\n", " <td>1.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (2, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", " (3, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", " (4, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", " (5, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (6, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", " (7, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (8, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", " (9, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", " (10, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", " (11, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", " (12, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", " (13, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", " (14, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS dropcols_out;\n", "\n", "SELECT madlib.dropcols(\n", " 'golf',\n", " 'dropcols_out',\n", " '\"OUTLOOK\", temperature, humidity'\n", ");\n", "\n", "SELECT * FROM dropcols_out ORDER BY id;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }