community-artifacts/Supervised-learning/Linear-regression-v1.ipynb (1,166 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear regression"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Greenplum Database 5.x on GCP (PM demo machine) - via tunnel\n",
"%sql postgresql://gpadmin@localhost:8000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load test data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"15 rows affected.\n",
"15 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>tax</th>\n",
" <th>bedroom</th>\n",
" <th>bath</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>lot</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>590</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>50000</td>\n",
" <td>770</td>\n",
" <td>22100</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1050</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>85000</td>\n",
" <td>1410</td>\n",
" <td>12000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>20</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>22500</td>\n",
" <td>1060</td>\n",
" <td>3500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>870</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>90000</td>\n",
" <td>1300</td>\n",
" <td>17500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>1320</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>133000</td>\n",
" <td>1500</td>\n",
" <td>30000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>1350</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>90500</td>\n",
" <td>820</td>\n",
" <td>25700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>2790</td>\n",
" <td>3</td>\n",
" <td>2.5</td>\n",
" <td>260000</td>\n",
" <td>2130</td>\n",
" <td>25000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>680</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>142500</td>\n",
" <td>1170</td>\n",
" <td>22000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>1840</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>160000</td>\n",
" <td>1500</td>\n",
" <td>19000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>3680</td>\n",
" <td>4</td>\n",
" <td>2.0</td>\n",
" <td>240000</td>\n",
" <td>2790</td>\n",
" <td>20000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>1660</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>87000</td>\n",
" <td>1030</td>\n",
" <td>17500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>1620</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>118600</td>\n",
" <td>1250</td>\n",
" <td>20000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>3100</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>140000</td>\n",
" <td>1760</td>\n",
" <td>38000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>2070</td>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>148000</td>\n",
" <td>1550</td>\n",
" <td>14000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>650</td>\n",
" <td>3</td>\n",
" <td>1.5</td>\n",
" <td>65000</td>\n",
" <td>1450</td>\n",
" <td>12000</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 590, 2, 1.0, 50000, 770, 22100),\n",
" (2, 1050, 3, 2.0, 85000, 1410, 12000),\n",
" (3, 20, 3, 1.0, 22500, 1060, 3500),\n",
" (4, 870, 2, 2.0, 90000, 1300, 17500),\n",
" (5, 1320, 3, 2.0, 133000, 1500, 30000),\n",
" (6, 1350, 2, 1.0, 90500, 820, 25700),\n",
" (7, 2790, 3, 2.5, 260000, 2130, 25000),\n",
" (8, 680, 2, 1.0, 142500, 1170, 22000),\n",
" (9, 1840, 3, 2.0, 160000, 1500, 19000),\n",
" (10, 3680, 4, 2.0, 240000, 2790, 20000),\n",
" (11, 1660, 3, 1.0, 87000, 1030, 17500),\n",
" (12, 1620, 3, 2.0, 118600, 1250, 20000),\n",
" (13, 3100, 3, 2.0, 140000, 1760, 38000),\n",
" (14, 2070, 2, 3.0, 148000, 1550, 14000),\n",
" (15, 650, 3, 1.5, 65000, 1450, 12000)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS houses;\n",
"\n",
"CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,\n",
" size INT, lot INT);\n",
"\n",
"INSERT INTO houses VALUES \n",
" (1 , 590 , 2 , 1 , 50000 , 770 , 22100),\n",
" (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000),\n",
" (3 , 20 , 3 , 1 , 22500 , 1060 , 3500),\n",
" (4 , 870 , 2 , 2 , 90000 , 1300 , 17500),\n",
" (5 , 1320 , 3 , 2 , 133000 , 1500 , 30000),\n",
" (6 , 1350 , 2 , 1 , 90500 , 820 , 25700),\n",
" (7 , 2790 , 3 , 2.5 , 260000 , 2130 , 25000),\n",
" (8 , 680 , 2 , 1 , 142500 , 1170 , 22000),\n",
" (9 , 1840 , 3 , 2 , 160000 , 1500 , 19000),\n",
" (10 , 3680 , 4 , 2 , 240000 , 2790 , 20000),\n",
" (11 , 1660 , 3 , 1 , 87000 , 1030 , 17500),\n",
" (12 , 1620 , 3 , 2 , 118600 , 1250 , 20000),\n",
" (13 , 3100 , 3 , 2 , 140000 , 1760 , 38000),\n",
" (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000),\n",
" (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000);\n",
" \n",
"SELECT * FROM houses ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Train regression model\n",
"First, we generate a single regression for all data."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>method</th>\n",
" <th>source_table</th>\n",
" <th>out_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>num_rows_processed</th>\n",
" <th>num_missing_rows_skipped</th>\n",
" <th>grouping_col</th>\n",
" </tr>\n",
" <tr>\n",
" <td>linregr</td>\n",
" <td>houses</td>\n",
" <td>houses_linregr</td>\n",
" <td>price</td>\n",
" <td>ARRAY[1, tax, bath, size]</td>\n",
" <td>15</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'linregr', u'houses', u'houses_linregr', u'price', u'ARRAY[1, tax, bath, size]', 15L, 0, None)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary;\n",
"\n",
"SELECT madlib.linregr_train( 'houses',\n",
" 'houses_linregr',\n",
" 'price',\n",
" 'ARRAY[1, tax, bath, size]'\n",
" );\n",
"\n",
"SELECT * FROM houses_linregr_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Train regression model with grouping\n",
"Next we generate three output models, one for each value of \"bedroom\"."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>method</th>\n",
" <th>source_table</th>\n",
" <th>out_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>num_rows_processed</th>\n",
" <th>num_missing_rows_skipped</th>\n",
" <th>grouping_col</th>\n",
" </tr>\n",
" <tr>\n",
" <td>linregr</td>\n",
" <td>houses</td>\n",
" <td>houses_linregr_bedroom</td>\n",
" <td>price</td>\n",
" <td>ARRAY[1, tax, bath, size]</td>\n",
" <td>15</td>\n",
" <td>0</td>\n",
" <td>bedroom</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'linregr', u'houses', u'houses_linregr_bedroom', u'price', u'ARRAY[1, tax, bath, size]', 15L, 0, u'bedroom')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary;\n",
"\n",
"SELECT madlib.linregr_train( 'houses',\n",
" 'houses_linregr_bedroom',\n",
" 'price',\n",
" 'ARRAY[1, tax, bath, size]',\n",
" 'bedroom'\n",
" );\n",
"\n",
"SELECT * FROM houses_linregr_bedroom_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Review model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>coef</th>\n",
" <th>r2</th>\n",
" <th>std_err</th>\n",
" <th>t_stats</th>\n",
" <th>p_values</th>\n",
" <th>condition_no</th>\n",
" <th>num_rows_processed</th>\n",
" <th>num_missing_rows_skipped</th>\n",
" <th>variance_covariance</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353]</td>\n",
" <td>0.768577580597</td>\n",
" <td>[33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856]</td>\n",
" <td>[-0.384103179688204, 1.82156166004197, 0.523806408809163, 1.53416118083608]</td>\n",
" <td>[0.708223134615411, 0.0958005827189556, 0.610804093526516, 0.153235085548177]</td>\n",
" <td>9002.5045707</td>\n",
" <td>15</td>\n",
" <td>0</td>\n",
" <td>[[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964038, -369.864520095145], [-283344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353], 0.768577580597462, [33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856], [-0.384103179688204, 1.82156166004197, 0.523806408809163, 1.53416118083608], [0.708223134615411, 0.0958005827189556, 0.610804093526516, 0.153235085548177], 9002.50457069859, 15L, 0L, [[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964038, -369.864520095145] ... (5 characters truncated) ... 83344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]])]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM houses_linregr;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively you can unnest the results for easier reading of output."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>attribute</th>\n",
" <th>coefficient</th>\n",
" <th>standard_error</th>\n",
" <th>t_stat</th>\n",
" <th>pvalue</th>\n",
" </tr>\n",
" <tr>\n",
" <td>intercept</td>\n",
" <td>-12849.416896</td>\n",
" <td>33453.0344331</td>\n",
" <td>-0.384103179688</td>\n",
" <td>0.708223134615</td>\n",
" </tr>\n",
" <tr>\n",
" <td>tax</td>\n",
" <td>28.9613922652</td>\n",
" <td>15.8992104964</td>\n",
" <td>1.82156166004</td>\n",
" <td>0.095800582719</td>\n",
" </tr>\n",
" <tr>\n",
" <td>bath</td>\n",
" <td>10181.6290713</td>\n",
" <td>19437.7710926</td>\n",
" <td>0.523806408809</td>\n",
" <td>0.610804093527</td>\n",
" </tr>\n",
" <tr>\n",
" <td>size</td>\n",
" <td>50.5168949154</td>\n",
" <td>32.9280231741</td>\n",
" <td>1.53416118084</td>\n",
" <td>0.153235085548</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'intercept', -12849.4168959872, 33453.0344331377, -0.384103179688204, 0.708223134615411),\n",
" (u'tax', 28.9613922651775, 15.8992104963991, 1.82156166004197, 0.0958005827189556),\n",
" (u'bath', 10181.6290712649, 19437.7710925915, 0.523806408809163, 0.610804093526516),\n",
" (u'size', 50.516894915353, 32.9280231740856, 1.53416118083608, 0.153235085548177)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,\n",
" unnest(coef) as coefficient,\n",
" unnest(std_err) as standard_error,\n",
" unnest(t_stats) as t_stat,\n",
" unnest(p_values) as pvalue\n",
"FROM houses_linregr;"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 5. Review model with grouping"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>bedroom</th>\n",
" <th>coef</th>\n",
" <th>r2</th>\n",
" <th>std_err</th>\n",
" <th>t_stats</th>\n",
" <th>p_values</th>\n",
" <th>condition_no</th>\n",
" <th>num_rows_processed</th>\n",
" <th>num_missing_rows_skipped</th>\n",
" <th>variance_covariance</th>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196]</td>\n",
" <td>0.968809546465</td>\n",
" <td>[35018.999166635, 19.5731125321026, 23036.8071292953, 49.0448678149636]</td>\n",
" <td>[-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069906]</td>\n",
" <td>[0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349]</td>\n",
" <td>10086.104872</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>[[1226330302.63279, -300921.595597853, 551696673.399771, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093], [551696673.399771, -304863.397298569, 530694482.712349, -946345.586402424], [-1544160.63236657, 323.251642470093, -946345.586402424, 2405.39905898726]]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234]</td>\n",
" <td>0.841699901311</td>\n",
" <td>[57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022]</td>\n",
" <td>[-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675973]</td>\n",
" <td>[0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835]</td>\n",
" <td>11722.6225642</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>[[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451145, -529.274668274391], [-70253017.4577515, -90019.0797451147, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878898]]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>[0.0112536020318378, 41.4132554771633, 0.0225072040636757, 31.3975496688276]</td>\n",
" <td>1.0</td>\n",
" <td>[0.0, 0.0, 0.0, 0.0]</td>\n",
" <td>[inf, inf, inf, inf]</td>\n",
" <td>None</td>\n",
" <td>inf</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(2, [-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196], 0.968809546465205, [35018.999166635, 19.5731125321026, 23036.8071292953, 49.0448678149636], [-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069906], [0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349], 10086.1048720296, 5L, 0L, [[1226330302.63279, -300921.595597853, 551696673.399771, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093 ... (4 characters truncated) ... 551696673.399771, -304863.397298569, 530694482.712349, -946345.586402424], [-1544160.63236657, 323.251642470093, -946345.586402424, 2405.39905898726]]),\n",
" (3, [-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234], 0.841699901311221, [57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022], [-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675973], [0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835], 11722.6225642065, 9L, 0L, [[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451145, -529.274668274391 ... (5 characters truncated) ... 70253017.4577515, -90019.0797451147, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878898]]),\n",
" (4, [0.0112536020318378, 41.4132554771633, 0.0225072040636757, 31.3975496688276], 1.0, [0.0, 0.0, 0.0, 0.0], [inf, inf, inf, inf], None, inf, 1L, 0L, [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM houses_linregr_bedroom ORDER BY bedroom;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Prediction\n",
"Compare predicted price with actual, in the case where not grouping is used. That is, there is only one regression model for all data. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>tax</th>\n",
" <th>bedroom</th>\n",
" <th>bath</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>lot</th>\n",
" <th>predict</th>\n",
" <th>residual</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>590</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>50000</td>\n",
" <td>770</td>\n",
" <td>22100</td>\n",
" <td>53317.4426966</td>\n",
" <td>-3317.44269655</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1050</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>85000</td>\n",
" <td>1410</td>\n",
" <td>12000</td>\n",
" <td>109152.124956</td>\n",
" <td>-24152.1249556</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>20</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>22500</td>\n",
" <td>1060</td>\n",
" <td>3500</td>\n",
" <td>51459.3486309</td>\n",
" <td>-28959.3486309</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>870</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>90000</td>\n",
" <td>1300</td>\n",
" <td>17500</td>\n",
" <td>98382.2159072</td>\n",
" <td>-8382.21590721</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>1320</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>133000</td>\n",
" <td>1500</td>\n",
" <td>30000</td>\n",
" <td>121518.22141</td>\n",
" <td>11481.7785904</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>1350</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>90500</td>\n",
" <td>820</td>\n",
" <td>25700</td>\n",
" <td>77853.9455639</td>\n",
" <td>12646.0544361</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>2790</td>\n",
" <td>3</td>\n",
" <td>2.5</td>\n",
" <td>260000</td>\n",
" <td>2130</td>\n",
" <td>25000</td>\n",
" <td>201007.926372</td>\n",
" <td>58992.0736283</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>680</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>142500</td>\n",
" <td>1170</td>\n",
" <td>22000</td>\n",
" <td>76130.7259666</td>\n",
" <td>66369.2740334</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>1840</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>160000</td>\n",
" <td>1500</td>\n",
" <td>19000</td>\n",
" <td>136578.145387</td>\n",
" <td>23421.8546125</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>3680</td>\n",
" <td>4</td>\n",
" <td>2.0</td>\n",
" <td>240000</td>\n",
" <td>2790</td>\n",
" <td>20000</td>\n",
" <td>255033.901596</td>\n",
" <td>-15033.9015962</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>1660</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>87000</td>\n",
" <td>1030</td>\n",
" <td>17500</td>\n",
" <td>97440.5250983</td>\n",
" <td>-10440.5250983</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>1620</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>118600</td>\n",
" <td>1250</td>\n",
" <td>20000</td>\n",
" <td>117577.41536</td>\n",
" <td>1022.58463968</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>3100</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>140000</td>\n",
" <td>1760</td>\n",
" <td>38000</td>\n",
" <td>186203.89232</td>\n",
" <td>-46203.8923196</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>2070</td>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>148000</td>\n",
" <td>1550</td>\n",
" <td>14000</td>\n",
" <td>155946.739426</td>\n",
" <td>-7946.73942552</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>650</td>\n",
" <td>3</td>\n",
" <td>1.5</td>\n",
" <td>65000</td>\n",
" <td>1450</td>\n",
" <td>12000</td>\n",
" <td>94497.4293105</td>\n",
" <td>-29497.4293105</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 590, 2, 1.0, 50000, 770, 22100, 53317.4426965543, -3317.44269655428),\n",
" (2, 1050, 3, 2.0, 85000, 1410, 12000, 109152.124955627, -24152.1249556268),\n",
" (3, 20, 3, 1.0, 22500, 1060, 3500, 51459.3486308555, -28959.3486308555),\n",
" (4, 870, 2, 2.0, 90000, 1300, 17500, 98382.215907206, -8382.21590720599),\n",
" (5, 1320, 3, 2.0, 133000, 1500, 30000, 121518.221409606, 11481.7785903935),\n",
" (6, 1350, 2, 1.0, 90500, 820, 25700, 77853.9455638568, 12646.0544361432),\n",
" (7, 2790, 3, 2.5, 260000, 2130, 25000, 201007.926371722, 58992.0736282778),\n",
" (8, 680, 2, 1.0, 142500, 1170, 22000, 76130.7259665615, 66369.2740334385),\n",
" (9, 1840, 3, 2.0, 160000, 1500, 19000, 136578.145387499, 23421.8546125013),\n",
" (10, 3680, 4, 2.0, 240000, 2790, 20000, 255033.901596231, -15033.9015962306),\n",
" (11, 1660, 3, 1.0, 87000, 1030, 17500, 97440.5250982859, -10440.5250982859),\n",
" (12, 1620, 3, 2.0, 118600, 1250, 20000, 117577.415360321, 1022.58463967856),\n",
" (13, 3100, 3, 2.0, 140000, 1760, 38000, 186203.892319614, -46203.8923196141),\n",
" (14, 2070, 2, 3.0, 148000, 1550, 14000, 155946.739425522, -7946.73942552213),\n",
" (15, 650, 3, 1.5, 65000, 1450, 12000, 94497.4293105374, -29497.4293105374)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT houses.*,\n",
" madlib.linregr_predict( m.coef,\n",
" ARRAY[1,tax,bath,size]\n",
" ) as predict,\n",
" price -\n",
" madlib.linregr_predict( m.coef,\n",
" ARRAY[1,tax,bath,size] \n",
" ) as residual\n",
"FROM houses, houses_linregr m ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 7. Prediction with grouping\n",
"Compare predicted price with actual with grouping. It means a different model is used depending on the number of bedrooms."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>tax</th>\n",
" <th>bedroom</th>\n",
" <th>bath</th>\n",
" <th>price</th>\n",
" <th>size</th>\n",
" <th>lot</th>\n",
" <th>predict</th>\n",
" <th>residual</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>590</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>50000</td>\n",
" <td>770</td>\n",
" <td>22100</td>\n",
" <td>43223.5393424</td>\n",
" <td>6776.4606576</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1050</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>85000</td>\n",
" <td>1410</td>\n",
" <td>12000</td>\n",
" <td>111527.60995</td>\n",
" <td>-26527.6099497</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>20</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>22500</td>\n",
" <td>1060</td>\n",
" <td>3500</td>\n",
" <td>20187.9052986</td>\n",
" <td>2312.09470137</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>870</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>90000</td>\n",
" <td>1300</td>\n",
" <td>17500</td>\n",
" <td>99354.9203363</td>\n",
" <td>-9354.92033626</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>1320</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>133000</td>\n",
" <td>1500</td>\n",
" <td>30000</td>\n",
" <td>124508.080626</td>\n",
" <td>8491.91937359</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>1350</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>90500</td>\n",
" <td>820</td>\n",
" <td>25700</td>\n",
" <td>96640.8258368</td>\n",
" <td>-6140.82583676</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>2790</td>\n",
" <td>3</td>\n",
" <td>2.5</td>\n",
" <td>260000</td>\n",
" <td>2130</td>\n",
" <td>25000</td>\n",
" <td>224650.799707</td>\n",
" <td>35349.2002927</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>680</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>142500</td>\n",
" <td>1170</td>\n",
" <td>22000</td>\n",
" <td>138458.174653</td>\n",
" <td>4041.82534729</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>1840</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>160000</td>\n",
" <td>1500</td>\n",
" <td>19000</td>\n",
" <td>138650.335314</td>\n",
" <td>21349.6646863</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>3680</td>\n",
" <td>4</td>\n",
" <td>2.0</td>\n",
" <td>240000</td>\n",
" <td>2790</td>\n",
" <td>20000</td>\n",
" <td>240000.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>1660</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>87000</td>\n",
" <td>1030</td>\n",
" <td>17500</td>\n",
" <td>62911.2752187</td>\n",
" <td>24088.7247813</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>1620</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>118600</td>\n",
" <td>1250</td>\n",
" <td>20000</td>\n",
" <td>117007.693446</td>\n",
" <td>1592.30655359</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>3100</td>\n",
" <td>3</td>\n",
" <td>2.0</td>\n",
" <td>140000</td>\n",
" <td>1760</td>\n",
" <td>38000</td>\n",
" <td>189203.861766</td>\n",
" <td>-49203.8617664</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>2070</td>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>148000</td>\n",
" <td>1550</td>\n",
" <td>14000</td>\n",
" <td>143322.539832</td>\n",
" <td>4677.46016813</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>650</td>\n",
" <td>3</td>\n",
" <td>1.5</td>\n",
" <td>65000</td>\n",
" <td>1450</td>\n",
" <td>12000</td>\n",
" <td>82452.4386727</td>\n",
" <td>-17452.4386727</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 590, 2, 1.0, 50000, 770, 22100, 43223.5393423978, 6776.46065760222),\n",
" (2, 1050, 3, 2.0, 85000, 1410, 12000, 111527.609949684, -26527.609949684),\n",
" (3, 20, 3, 1.0, 22500, 1060, 3500, 20187.9052986341, 2312.09470136587),\n",
" (4, 870, 2, 2.0, 90000, 1300, 17500, 99354.9203362612, -9354.92033626116),\n",
" (5, 1320, 3, 2.0, 133000, 1500, 30000, 124508.080626412, 8491.91937358756),\n",
" (6, 1350, 2, 1.0, 90500, 820, 25700, 96640.8258367579, -6140.8258367579),\n",
" (7, 2790, 3, 2.5, 260000, 2130, 25000, 224650.799707327, 35349.2002926733),\n",
" (8, 680, 2, 1.0, 142500, 1170, 22000, 138458.174652714, 4041.82534728572),\n",
" (9, 1840, 3, 2.0, 160000, 1500, 19000, 138650.335313722, 21349.6646862777),\n",
" (10, 3680, 4, 2.0, 240000, 2790, 20000, 240000.0, 0.0),\n",
" (11, 1660, 3, 1.0, 87000, 1030, 17500, 62911.2752186594, 24088.7247813406),\n",
" (12, 1620, 3, 2.0, 118600, 1250, 20000, 117007.693446414, 1592.30655358579),\n",
" (13, 3100, 3, 2.0, 140000, 1760, 38000, 189203.861766403, -49203.8617664034),\n",
" (14, 2070, 2, 3.0, 148000, 1550, 14000, 143322.539831869, 4677.46016813093),\n",
" (15, 650, 3, 1.5, 65000, 1450, 12000, 82452.4386727394, -17452.4386727394)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT houses.*,\n",
" madlib.linregr_predict( m.coef,\n",
" ARRAY[1,tax,bath,size]\n",
" ) as predict,\n",
" price -\n",
" madlib.linregr_predict( m.coef,\n",
" ARRAY[1,tax,bath,size] \n",
" ) as residual\n",
"FROM houses, houses_linregr_bedroom m\n",
"WHERE houses.bedroom = m.bedroom\n",
"ORDER BY id;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}