community-artifacts/Sampling/Stratified-sampling-v2.ipynb (672 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stratified sampling\n",
"Stratified sampling is a method for sampling subpopulations (strata) independently. It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.\n",
"\n",
"Stratified sampling was added in MADlib 1.12."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The sql extension is already loaded. To reload it, use:\n",
" %reload_ext sql\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpdbchina@madlib'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
" \n",
"# PostgreSQL local\n",
"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
"\n",
"# Greenplum Database 4.3.10.0\n",
"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Create input table"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"25 rows affected.\n",
"25 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id1</th>\n",
" <th>id2</th>\n",
" <th>gr1</th>\n",
" <th>gr2</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>30</td>\n",
" <td>30</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50</td>\n",
" <td>50</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>60</td>\n",
" <td>60</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>70</td>\n",
" <td>70</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0, 1, 1),\n",
" (2, 0, 1, 1),\n",
" (3, 0, 1, 1),\n",
" (4, 0, 1, 1),\n",
" (5, 0, 1, 1),\n",
" (6, 0, 1, 1),\n",
" (7, 0, 1, 1),\n",
" (8, 0, 1, 1),\n",
" (9, 0, 1, 1),\n",
" (9, 0, 1, 1),\n",
" (9, 0, 1, 1),\n",
" (9, 0, 1, 1),\n",
" (0, 1, 1, 2),\n",
" (0, 2, 1, 2),\n",
" (0, 3, 1, 2),\n",
" (0, 4, 1, 2),\n",
" (0, 5, 1, 2),\n",
" (0, 6, 1, 2),\n",
" (10, 10, 2, 2),\n",
" (20, 20, 2, 2),\n",
" (30, 30, 2, 2),\n",
" (40, 40, 2, 2),\n",
" (50, 50, 2, 2),\n",
" (60, 60, 2, 2),\n",
" (70, 70, 2, 2)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS test;\n",
"\n",
"CREATE TABLE test(\n",
" id1 INTEGER,\n",
" id2 INTEGER,\n",
" gr1 INTEGER,\n",
" gr2 INTEGER\n",
");\n",
"\n",
"INSERT INTO test VALUES\n",
"(1,0,1,1),\n",
"(2,0,1,1),\n",
"(3,0,1,1),\n",
"(4,0,1,1),\n",
"(5,0,1,1),\n",
"(6,0,1,1),\n",
"(7,0,1,1),\n",
"(8,0,1,1),\n",
"(9,0,1,1),\n",
"(9,0,1,1),\n",
"(9,0,1,1),\n",
"(9,0,1,1),\n",
"(0,1,1,2),\n",
"(0,2,1,2),\n",
"(0,3,1,2),\n",
"(0,4,1,2),\n",
"(0,5,1,2),\n",
"(0,6,1,2),\n",
"(10,10,2,2),\n",
"(20,20,2,2),\n",
"(30,30,2,2),\n",
"(40,40,2,2),\n",
"(50,50,2,2),\n",
"(60,60,2,2),\n",
"(70,70,2,2);\n",
"\n",
"SELECT * FROM test ORDER BY gr1, gr2, id1, id2;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Sample without replacement"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"13 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>gr1</th>\n",
" <th>gr2</th>\n",
" <th>id1</th>\n",
" <th>id2</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>30</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>50</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>70</td>\n",
" <td>70</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 1, 1, 0),\n",
" (1, 1, 2, 0),\n",
" (1, 1, 3, 0),\n",
" (1, 1, 6, 0),\n",
" (1, 1, 9, 0),\n",
" (1, 1, 9, 0),\n",
" (1, 2, 0, 1),\n",
" (1, 2, 0, 2),\n",
" (1, 2, 0, 5),\n",
" (2, 2, 30, 30),\n",
" (2, 2, 40, 40),\n",
" (2, 2, 50, 50),\n",
" (2, 2, 70, 70)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS out;\n",
"\n",
"SELECT madlib.stratified_sample(\n",
" 'test', -- Source table\n",
" 'out', -- Output table\n",
" 0.5, -- Sample proportion\n",
" 'gr1,gr2', -- Strata definition\n",
" 'id1,id2', -- Columns to output\n",
" FALSE); -- Sample without replacement\n",
"\n",
"SELECT * FROM out ORDER BY gr1,gr2,id1,id2;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Sample with replacement"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"13 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>gr1</th>\n",
" <th>gr2</th>\n",
" <th>id1</th>\n",
" <th>id2</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 1, 1, 0),\n",
" (1, 1, 2, 0),\n",
" (1, 1, 3, 0),\n",
" (1, 1, 4, 0),\n",
" (1, 1, 8, 0),\n",
" (1, 1, 9, 0),\n",
" (1, 2, 0, 5),\n",
" (1, 2, 0, 5),\n",
" (1, 2, 0, 6),\n",
" (2, 2, 20, 20),\n",
" (2, 2, 20, 20),\n",
" (2, 2, 40, 40),\n",
" (2, 2, 40, 40)]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS out;\n",
"\n",
"SELECT madlib.stratified_sample(\n",
" 'test', -- Source table\n",
" 'out', -- Output table\n",
" 0.5, -- Sample proportion\n",
" 'gr1,gr2', -- Strata definition\n",
" 'id1,id2', -- Columns to output\n",
" TRUE); -- Sample with replacement\n",
"\n",
"SELECT * FROM out ORDER BY gr1,gr2,id1,id2;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}