community-artifacts/Sampling/Balanced-sampling-v1.ipynb (3,706 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Balanced sampling\n",
"\n",
"This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n",
"\n",
"Balanced sampling was added in MADlib 1.14."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The sql extension is already loaded. To reload it, use:\n",
" %reload_ext sql\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpadmin@madlib'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
"\n",
"# Greenplum Database 4.3.10.0\n",
"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load data\n",
"Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"22 rows affected.\n",
"22 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>Austria</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>84</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>xElba</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>22</td>\n",
" <td>xPrussia</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>249</td>\n",
" <td>61</td>\n",
" <td>4</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
" (5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (21, u'xElba', 3, 1, 1, 1, 6, None, None),\n",
" (22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS flags;\n",
"\n",
"CREATE TABLE flags (\n",
" id INTEGER,\n",
" name TEXT,\n",
" landmass INTEGER,\n",
" zone INTEGER,\n",
" area INTEGER,\n",
" population INTEGER,\n",
" language INTEGER,\n",
" colours INTEGER,\n",
" mainhue TEXT\n",
");\n",
"\n",
"INSERT INTO flags VALUES\n",
"(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),\n",
"(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),\n",
"(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),\n",
"(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),\n",
"(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),\n",
"(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),\n",
"(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),\n",
"(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),\n",
"(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),\n",
"(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),\n",
"(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),\n",
"(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),\n",
"(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),\n",
"(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),\n",
"(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),\n",
"(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),\n",
"(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),\n",
"(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),\n",
"(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),\n",
"(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),\n",
"(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),\n",
"(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);\n",
"\n",
"SELECT * FROM flags ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Uniform sampling \n",
"\n",
"All class values will be resampled so that they have the same number of rows. The output data size will be the same as the input data size, ignoring NULL values. Uniform sampling is the default for the 'class_size' parameter so we do not need to explicitly set it: "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"20 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(19L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (18L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (20L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (16L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (10L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (8L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (1L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (4L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (2L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue'); -- Class column\n",
" \n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next we do uniform sampling again, but this time we specify a size for the output table:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"12 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (6L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (5L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (9L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (7L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'uniform', -- Uniform sample\n",
" 12); -- Desired output table size\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {
"scrolled": true
},
"source": [
"# 3. Oversampling\n",
"Oversample with replacement such that all class values except NULLs end up with the same number of observations as the majority class. Countries with red flags is the majority class with 10 observations, so other class values will be oversampled to 10 observations:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"40 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>37</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>39</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>40</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>32</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>31</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>35</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>33</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>36</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>34</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>Austria</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>84</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>27</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>29</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>28</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>26</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>30</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>22</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>24</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>23</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(37L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (39L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (38L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (40L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (32L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (31L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (35L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (33L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (36L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (34L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (18L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (16L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (15L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (19L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (11L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (13L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (14L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (12L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (20L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (8L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
" (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (2L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (5L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (1L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (9L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (10L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (7L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (27L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (29L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (28L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (26L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (30L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (22L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (24L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (23L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (25L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (21L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'oversample'); -- Oversample\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Undersampling\n",
"Undersample such that all class values except NULLs end up with the same number of observations as the minority class. Countries with white flags is the minority class with 2 observations, so other class values will be undersampled to 2 observations:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"8 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (3L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (6L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (7L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (8L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'undersample'); -- Undersample\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the case of bootstrapping, we may want to undersample with replacement, so we set the 'with_replacement' parameter to TRUE:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"8 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>Austria</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>84</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (8L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (1L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (4L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
" (3L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (6L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (5L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'undersample', -- Undersample\n",
" NULL, -- Output table size will be calculated\n",
" NULL, -- No grouping\n",
" 'TRUE'); -- Sample with replacement\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note above that some rows may appear multiple times above since we sampled with replacement."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Setting class size by count\n",
"\n",
"Here we set the number of rows for red and blue flags, and leave green and white flags unchanged:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"19 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Austria</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>84</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(11L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (12L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (10L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (16L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (14L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (13L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (15L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (2L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
" (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (1L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (18L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (19L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'red=7, blue=7'); -- Want 7 reds and 7 blues\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next we set the number of rows for red and blue flags, and also set an output table size. This means that green and white flags will be uniformly sampled to get to the desired output table size:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"22 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>22</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(21L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (16L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (19L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (20L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (18L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (10L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (9L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (11L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (7L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (6L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (3L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (1L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (13L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'red=7, blue=7', -- Want 7 reds and 7 blues\n",
" 22); -- Desired output table size\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. NULL handling\n",
"\n",
"To make NULL a valid class value, set the parameter to keep NULLs:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"25 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>22</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>23</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>21</td>\n",
" <td>xElba</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>21</td>\n",
" <td>xElba</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>22</td>\n",
" <td>xPrussia</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>249</td>\n",
" <td>61</td>\n",
" <td>4</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>22</td>\n",
" <td>xPrussia</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>249</td>\n",
" <td>61</td>\n",
" <td>4</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>22</td>\n",
" <td>xPrussia</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>249</td>\n",
" <td>61</td>\n",
" <td>4</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(24L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (25L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (21L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (23L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (8L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (10L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (2L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (5L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (1L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (11L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (14L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (15L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (17L, 21, u'xElba', 3, 1, 1, 1, 6, None, None),\n",
" (16L, 21, u'xElba', 3, 1, 1, 1, 6, None, None),\n",
" (20L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None),\n",
" (18L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None),\n",
" (19L, 22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" NULL, -- Uniform\n",
" NULL, -- Output table size same as input\n",
" NULL, -- No grouping\n",
" NULL, -- Sample without replacement\n",
" 'TRUE'); -- Make NULLs a valid class value\n",
"\n",
"SELECT * FROM output_table ORDER BY mainhue, name;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 7. Grouping\n",
"To perform the balance sampling for independent groups, use the 'grouping_cols' parameter. Note below that each group (zone) has a different count of the classes (mainhue), with some groups not containing some class values."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"23 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (2L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (4L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (6L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (5L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (1L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (1L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (3L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (9L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (7L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (5L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n",
" (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" NULL, -- Uniform\n",
" NULL, -- Output table size\n",
" 'zone' -- Grouping by zone\n",
");\n",
"\n",
"SELECT * FROM output_table ORDER BY zone, mainhue;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grouping can be used with class size specification as well. Note below that 'blue=<Integer>' is the only valid class value since 'blue' is the only class value that is present in each group. Further, blue=8` in the example below will be split between the four groups, resulting in two blue rows for each group."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"23 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__madlib_id__</th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>landmass</th>\n",
" <th>zone</th>\n",
" <th>area</th>\n",
" <th>population</th>\n",
" <th>language</th>\n",
" <th>colours</th>\n",
" <th>mainhue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>17</td>\n",
" <td>Sweden</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>450</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>Greece</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>132</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>Denmark</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>14</td>\n",
" <td>Norway</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>324</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>18</td>\n",
" <td>Switzerland</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>12</td>\n",
" <td>Luxembourg</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Austria</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>84</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>China</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>9561</td>\n",
" <td>1008</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Australia</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>7690</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Argentina</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2777</td>\n",
" <td>28</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Brazil</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8512</td>\n",
" <td>119</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>9</td>\n",
" <td>Guatemala</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>109</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>blue</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>11</td>\n",
" <td>Jamaica</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>13</td>\n",
" <td>Mexico</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1973</td>\n",
" <td>77</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>green</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>19</td>\n",
" <td>UK</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>245</td>\n",
" <td>56</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>16</td>\n",
" <td>Spain</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>505</td>\n",
" <td>38</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>Canada</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9976</td>\n",
" <td>24</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>15</td>\n",
" <td>Portugal</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>92</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>red</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>10</td>\n",
" <td>Ireland</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>USA</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>9363</td>\n",
" <td>231</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>white</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(3L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n",
" (4L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n",
" (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n",
" (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n",
" (8L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n",
" (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n",
" (1L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n",
" (5L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n",
" (1L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (2L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n",
" (1L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n",
" (3L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n",
" (7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (6L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n",
" (2L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n",
" (3L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n",
" (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n",
" (9L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n",
" (1L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n",
" (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n",
" (8L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n",
" (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS output_table;\n",
"\n",
"SELECT madlib.balance_sample(\n",
" 'flags', -- Source table\n",
" 'output_table', -- Output table\n",
" 'mainhue', -- Class column\n",
" 'blue=8', -- Specified class value size. Rest of the values are outputed as is.\n",
" NULL, -- Output table size\n",
" 'zone' -- Group by zone\n",
");\n",
"\n",
"SELECT * FROM output_table ORDER BY zone, mainhue;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}