community-artifacts/Sampling/Balanced-sampling-v1.ipynb (3,706 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Balanced sampling\n", "\n", "This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n", "\n", "Balanced sampling was added in MADlib 1.14." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sql extension is already loaded. To reload it, use:\n", " %reload_ext sql\n" ] } ], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "u'Connected: gpadmin@madlib'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Greenplum Database 5.4.0 on GCP (demo machine)\n", "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", " \n", "# PostgreSQL local\n", "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", "\n", "# Greenplum Database 4.3.10.0\n", "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();\n", "#%sql select version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load data\n", "Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "22 rows affected.\n", "22 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>Austria</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>84</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>21</td>\n", " <td>xElba</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>6</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>22</td>\n", " <td>xPrussia</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>249</td>\n", " <td>61</td>\n", " <td>4</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", " (5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (21, u'xElba', 3, 1, 1, 1, 6, None, None),\n", " (22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS flags;\n", "\n", "CREATE TABLE flags (\n", " id INTEGER,\n", " name TEXT,\n", " landmass INTEGER,\n", " zone INTEGER,\n", " area INTEGER,\n", " population INTEGER,\n", " language INTEGER,\n", " colours INTEGER,\n", " mainhue TEXT\n", ");\n", "\n", "INSERT INTO flags VALUES\n", "(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),\n", "(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),\n", "(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),\n", "(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),\n", "(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),\n", "(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),\n", "(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),\n", "(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),\n", "(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),\n", "(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),\n", "(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),\n", "(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),\n", "(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),\n", "(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),\n", "(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),\n", "(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),\n", "(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),\n", "(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),\n", "(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),\n", "(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),\n", "(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),\n", "(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);\n", "\n", "SELECT * FROM flags ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Uniform sampling \n", "\n", "All class values will be resampled so that they have the same number of rows. The output data size will be the same as the input data size, ignoring NULL values. Uniform sampling is the default for the 'class_size' parameter so we do not need to explicitly set it: " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "20 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(19L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (18L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (20L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (16L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (10L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (8L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (1L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (4L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (2L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue'); -- Class column\n", " \n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we do uniform sampling again, but this time we specify a size for the output table:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "12 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (6L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (5L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (9L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (7L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'uniform', -- Uniform sample\n", " 12); -- Desired output table size\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": { "scrolled": true }, "source": [ "# 3. Oversampling\n", "Oversample with replacement such that all class values except NULLs end up with the same number of observations as the majority class. Countries with red flags is the majority class with 10 observations, so other class values will be oversampled to 10 observations:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "40 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>37</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>39</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>40</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>32</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>31</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>35</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>33</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>36</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>34</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>3</td>\n", " <td>Austria</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>84</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>27</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>29</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>28</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>26</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>30</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>22</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>24</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>23</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>25</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>21</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(37L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (39L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (38L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (40L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (32L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (31L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (35L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (33L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (36L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (34L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (18L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (16L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (15L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (19L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (11L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (13L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (14L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (12L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (20L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (8L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", " (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (2L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (5L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (1L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (9L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (10L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (7L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (27L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (29L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (28L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (26L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (30L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (22L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (24L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (23L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (25L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (21L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'oversample'); -- Oversample\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Undersampling\n", "Undersample such that all class values except NULLs end up with the same number of observations as the minority class. Countries with white flags is the minority class with 2 observations, so other class values will be undersampled to 2 observations:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "8 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (3L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (6L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (7L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (8L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'undersample'); -- Undersample\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the case of bootstrapping, we may want to undersample with replacement, so we set the 'with_replacement' parameter to TRUE:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "8 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>Austria</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>84</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (8L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (1L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (4L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", " (3L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (6L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (5L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'undersample', -- Undersample\n", " NULL, -- Output table size will be calculated\n", " NULL, -- No grouping\n", " 'TRUE'); -- Sample with replacement\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note above that some rows may appear multiple times above since we sampled with replacement." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Setting class size by count\n", "\n", "Here we set the number of rows for red and blue flags, and leave green and white flags unchanged:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "19 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>Austria</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>84</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(11L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (12L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (10L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (16L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (14L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (13L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (15L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (2L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", " (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (1L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (18L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (19L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'red=7, blue=7'); -- Want 7 reds and 7 blues\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we set the number of rows for red and blue flags, and also set an output table size. This means that green and white flags will be uniformly sampled to get to the desired output table size:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "22 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>21</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>22</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(21L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (16L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (19L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (20L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (18L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (10L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (9L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (11L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (7L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (6L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (3L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (1L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (13L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'red=7, blue=7', -- Want 7 reds and 7 blues\n", " 22); -- Desired output table size\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. NULL handling\n", "\n", "To make NULL a valid class value, set the parameter to keep NULLs:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "25 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>25</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>22</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>21</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>23</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>21</td>\n", " <td>xElba</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>6</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>21</td>\n", " <td>xElba</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>6</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>22</td>\n", " <td>xPrussia</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>249</td>\n", " <td>61</td>\n", " <td>4</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>22</td>\n", " <td>xPrussia</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>249</td>\n", " <td>61</td>\n", " <td>4</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>22</td>\n", " <td>xPrussia</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>249</td>\n", " <td>61</td>\n", " <td>4</td>\n", " <td>None</td>\n", " <td>None</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(24L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (25L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (21L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (23L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (8L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (10L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (2L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (5L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (1L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (11L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (14L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (15L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (17L, 21, u'xElba', 3, 1, 1, 1, 6, None, None),\n", " (16L, 21, u'xElba', 3, 1, 1, 1, 6, None, None),\n", " (20L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None),\n", " (18L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None),\n", " (19L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " NULL, -- Uniform\n", " NULL, -- Output table size same as input\n", " NULL, -- No grouping\n", " NULL, -- Sample without replacement\n", " 'TRUE'); -- Make NULLs a valid class value\n", "\n", "SELECT * FROM output_table ORDER BY mainhue, name;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. Grouping\n", "To perform the balance sampling for independent groups, use the 'grouping_cols' parameter. Note below that each group (zone) has a different count of the classes (mainhue), with some groups not containing some class values." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "23 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (2L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (4L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (6L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (5L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (1L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (1L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (3L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (9L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (7L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (5L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " NULL, -- Uniform\n", " NULL, -- Output table size\n", " 'zone' -- Grouping by zone\n", ");\n", "\n", "SELECT * FROM output_table ORDER BY zone, mainhue;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grouping can be used with class size specification as well. Note below that 'blue=<Integer>' is the only valid class value since 'blue' is the only class value that is present in each group. Further, blue=8` in the example below will be split between the four groups, resulting in two blue rows for each group." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "23 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>__madlib_id__</th>\n", " <th>id</th>\n", " <th>name</th>\n", " <th>landmass</th>\n", " <th>zone</th>\n", " <th>area</th>\n", " <th>population</th>\n", " <th>language</th>\n", " <th>colours</th>\n", " <th>mainhue</th>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>17</td>\n", " <td>Sweden</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>450</td>\n", " <td>8</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>8</td>\n", " <td>Greece</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>132</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>7</td>\n", " <td>Denmark</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>43</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>14</td>\n", " <td>Norway</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>324</td>\n", " <td>4</td>\n", " <td>6</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>18</td>\n", " <td>Switzerland</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>41</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>12</td>\n", " <td>Luxembourg</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Austria</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>84</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>China</td>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>9561</td>\n", " <td>1008</td>\n", " <td>7</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>Australia</td>\n", " <td>6</td>\n", " <td>2</td>\n", " <td>7690</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>Argentina</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>2777</td>\n", " <td>28</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>Brazil</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>8512</td>\n", " <td>119</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>9</td>\n", " <td>Guatemala</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>109</td>\n", " <td>8</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>blue</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>11</td>\n", " <td>Jamaica</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>11</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>13</td>\n", " <td>Mexico</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>1973</td>\n", " <td>77</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>green</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>19</td>\n", " <td>UK</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>245</td>\n", " <td>56</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>16</td>\n", " <td>Spain</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>505</td>\n", " <td>38</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>Canada</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9976</td>\n", " <td>24</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>15</td>\n", " <td>Portugal</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>92</td>\n", " <td>10</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>red</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>10</td>\n", " <td>Ireland</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>70</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>20</td>\n", " <td>USA</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>9363</td>\n", " <td>231</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>white</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(3L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", " (4L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", " (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", " (8L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", " (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", " (1L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", " (5L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", " (1L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (2L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", " (1L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", " (3L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", " (7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (6L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", " (2L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", " (3L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", " (9L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", " (1L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", " (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", " (8L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS output_table;\n", "\n", "SELECT madlib.balance_sample(\n", " 'flags', -- Source table\n", " 'output_table', -- Output table\n", " 'mainhue', -- Class column\n", " 'blue=8', -- Specified class value size. Rest of the values are outputed as is.\n", " NULL, -- Output table size\n", " 'zone' -- Group by zone\n", ");\n", "\n", "SELECT * FROM output_table ORDER BY zone, mainhue;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }