community-artifacts/Supervised-learning/Linear-regression-v1.ipynb (1,166 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear regression" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Greenplum Database 5.x on GCP (PM demo machine) - via tunnel\n", "%sql postgresql://gpadmin@localhost:8000/madlib\n", " \n", "# PostgreSQL local\n", "#%sql postgresql://fmcquillan@localhost:5432/madlib" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load test data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "15 rows affected.\n", "15 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>tax</th>\n", " <th>bedroom</th>\n", " <th>bath</th>\n", " <th>price</th>\n", " <th>size</th>\n", " <th>lot</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>590</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>50000</td>\n", " <td>770</td>\n", " <td>22100</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1050</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>85000</td>\n", " <td>1410</td>\n", " <td>12000</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>20</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>22500</td>\n", " <td>1060</td>\n", " <td>3500</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>870</td>\n", " <td>2</td>\n", " <td>2.0</td>\n", " <td>90000</td>\n", " <td>1300</td>\n", " <td>17500</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>1320</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>133000</td>\n", " <td>1500</td>\n", " <td>30000</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>1350</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>90500</td>\n", " <td>820</td>\n", " <td>25700</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>2790</td>\n", " <td>3</td>\n", " <td>2.5</td>\n", " <td>260000</td>\n", " <td>2130</td>\n", " <td>25000</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>680</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>142500</td>\n", " <td>1170</td>\n", " <td>22000</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>1840</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>160000</td>\n", " <td>1500</td>\n", " <td>19000</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>3680</td>\n", " <td>4</td>\n", " <td>2.0</td>\n", " <td>240000</td>\n", " <td>2790</td>\n", " <td>20000</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>1660</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>87000</td>\n", " <td>1030</td>\n", " <td>17500</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>1620</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>118600</td>\n", " <td>1250</td>\n", " <td>20000</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>3100</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>140000</td>\n", " <td>1760</td>\n", " <td>38000</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>2070</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>148000</td>\n", " <td>1550</td>\n", " <td>14000</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>650</td>\n", " <td>3</td>\n", " <td>1.5</td>\n", " <td>65000</td>\n", " <td>1450</td>\n", " <td>12000</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 590, 2, 1.0, 50000, 770, 22100),\n", " (2, 1050, 3, 2.0, 85000, 1410, 12000),\n", " (3, 20, 3, 1.0, 22500, 1060, 3500),\n", " (4, 870, 2, 2.0, 90000, 1300, 17500),\n", " (5, 1320, 3, 2.0, 133000, 1500, 30000),\n", " (6, 1350, 2, 1.0, 90500, 820, 25700),\n", " (7, 2790, 3, 2.5, 260000, 2130, 25000),\n", " (8, 680, 2, 1.0, 142500, 1170, 22000),\n", " (9, 1840, 3, 2.0, 160000, 1500, 19000),\n", " (10, 3680, 4, 2.0, 240000, 2790, 20000),\n", " (11, 1660, 3, 1.0, 87000, 1030, 17500),\n", " (12, 1620, 3, 2.0, 118600, 1250, 20000),\n", " (13, 3100, 3, 2.0, 140000, 1760, 38000),\n", " (14, 2070, 2, 3.0, 148000, 1550, 14000),\n", " (15, 650, 3, 1.5, 65000, 1450, 12000)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "DROP TABLE IF EXISTS houses;\n", "\n", "CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,\n", " size INT, lot INT);\n", "\n", "INSERT INTO houses VALUES \n", " (1 , 590 , 2 , 1 , 50000 , 770 , 22100),\n", " (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000),\n", " (3 , 20 , 3 , 1 , 22500 , 1060 , 3500),\n", " (4 , 870 , 2 , 2 , 90000 , 1300 , 17500),\n", " (5 , 1320 , 3 , 2 , 133000 , 1500 , 30000),\n", " (6 , 1350 , 2 , 1 , 90500 , 820 , 25700),\n", " (7 , 2790 , 3 , 2.5 , 260000 , 2130 , 25000),\n", " (8 , 680 , 2 , 1 , 142500 , 1170 , 22000),\n", " (9 , 1840 , 3 , 2 , 160000 , 1500 , 19000),\n", " (10 , 3680 , 4 , 2 , 240000 , 2790 , 20000),\n", " (11 , 1660 , 3 , 1 , 87000 , 1030 , 17500),\n", " (12 , 1620 , 3 , 2 , 118600 , 1250 , 20000),\n", " (13 , 3100 , 3 , 2 , 140000 , 1760 , 38000),\n", " (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000),\n", " (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000);\n", " \n", "SELECT * FROM houses ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Train regression model\n", "First, we generate a single regression for all data." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source_table</th>\n", " <th>out_table</th>\n", " <th>dependent_varname</th>\n", " <th>independent_varname</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>grouping_col</th>\n", " </tr>\n", " <tr>\n", " <td>linregr</td>\n", " <td>houses</td>\n", " <td>houses_linregr</td>\n", " <td>price</td>\n", " <td>ARRAY[1, tax, bath, size]</td>\n", " <td>15</td>\n", " <td>0</td>\n", " <td>None</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'linregr', u'houses', u'houses_linregr', u'price', u'ARRAY[1, tax, bath, size]', 15L, 0, None)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary;\n", "\n", "SELECT madlib.linregr_train( 'houses',\n", " 'houses_linregr',\n", " 'price',\n", " 'ARRAY[1, tax, bath, size]'\n", " );\n", "\n", "SELECT * FROM houses_linregr_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Train regression model with grouping\n", "Next we generate three output models, one for each value of \"bedroom\"." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source_table</th>\n", " <th>out_table</th>\n", " <th>dependent_varname</th>\n", " <th>independent_varname</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>grouping_col</th>\n", " </tr>\n", " <tr>\n", " <td>linregr</td>\n", " <td>houses</td>\n", " <td>houses_linregr_bedroom</td>\n", " <td>price</td>\n", " <td>ARRAY[1, tax, bath, size]</td>\n", " <td>15</td>\n", " <td>0</td>\n", " <td>bedroom</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'linregr', u'houses', u'houses_linregr_bedroom', u'price', u'ARRAY[1, tax, bath, size]', 15L, 0, u'bedroom')]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary;\n", "\n", "SELECT madlib.linregr_train( 'houses',\n", " 'houses_linregr_bedroom',\n", " 'price',\n", " 'ARRAY[1, tax, bath, size]',\n", " 'bedroom'\n", " );\n", "\n", "SELECT * FROM houses_linregr_bedroom_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Review model" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>coef</th>\n", " <th>r2</th>\n", " <th>std_err</th>\n", " <th>t_stats</th>\n", " <th>p_values</th>\n", " <th>condition_no</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>variance_covariance</th>\n", " </tr>\n", " <tr>\n", " <td>[-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353]</td>\n", " <td>0.768577580597</td>\n", " <td>[33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856]</td>\n", " <td>[-0.384103179688204, 1.82156166004197, 0.523806408809163, 1.53416118083608]</td>\n", " <td>[0.708223134615411, 0.0958005827189556, 0.610804093526516, 0.153235085548177]</td>\n", " <td>9002.5045707</td>\n", " <td>15</td>\n", " <td>0</td>\n", " <td>[[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964038, -369.864520095145], [-283344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[([-12849.4168959872, 28.9613922651775, 10181.6290712649, 50.516894915353], 0.768577580597462, [33453.0344331377, 15.8992104963991, 19437.7710925915, 32.9280231740856], [-0.384103179688204, 1.82156166004197, 0.523806408809163, 1.53416118083608], [0.708223134615411, 0.0958005827189556, 0.610804093526516, 0.153235085548177], 9002.50457069859, 15L, 0L, [[1119105512.7847, 217782.067878005, -283344228.394538, -616679.693190829], [217782.067878005, 252.784894408806, -46373.1796964038, -369.864520095145] ... (5 characters truncated) ... 83344228.394538, -46373.1796964038, 377826945.047986, -209088.217319699], [-616679.693190829, -369.864520095145, -209088.217319699, 1084.25471015312]])]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM houses_linregr;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively you can unnest the results for easier reading of output." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>attribute</th>\n", " <th>coefficient</th>\n", " <th>standard_error</th>\n", " <th>t_stat</th>\n", " <th>pvalue</th>\n", " </tr>\n", " <tr>\n", " <td>intercept</td>\n", " <td>-12849.416896</td>\n", " <td>33453.0344331</td>\n", " <td>-0.384103179688</td>\n", " <td>0.708223134615</td>\n", " </tr>\n", " <tr>\n", " <td>tax</td>\n", " <td>28.9613922652</td>\n", " <td>15.8992104964</td>\n", " <td>1.82156166004</td>\n", " <td>0.095800582719</td>\n", " </tr>\n", " <tr>\n", " <td>bath</td>\n", " <td>10181.6290713</td>\n", " <td>19437.7710926</td>\n", " <td>0.523806408809</td>\n", " <td>0.610804093527</td>\n", " </tr>\n", " <tr>\n", " <td>size</td>\n", " <td>50.5168949154</td>\n", " <td>32.9280231741</td>\n", " <td>1.53416118084</td>\n", " <td>0.153235085548</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'intercept', -12849.4168959872, 33453.0344331377, -0.384103179688204, 0.708223134615411),\n", " (u'tax', 28.9613922651775, 15.8992104963991, 1.82156166004197, 0.0958005827189556),\n", " (u'bath', 10181.6290712649, 19437.7710925915, 0.523806408809163, 0.610804093526516),\n", " (u'size', 50.516894915353, 32.9280231740856, 1.53416118083608, 0.153235085548177)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,\n", " unnest(coef) as coefficient,\n", " unnest(std_err) as standard_error,\n", " unnest(t_stats) as t_stat,\n", " unnest(p_values) as pvalue\n", "FROM houses_linregr;" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# 5. Review model with grouping" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>bedroom</th>\n", " <th>coef</th>\n", " <th>r2</th>\n", " <th>std_err</th>\n", " <th>t_stats</th>\n", " <th>p_values</th>\n", " <th>condition_no</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>variance_covariance</th>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196]</td>\n", " <td>0.968809546465</td>\n", " <td>[35018.999166635, 19.5731125321026, 23036.8071292953, 49.0448678149636]</td>\n", " <td>[-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069906]</td>\n", " <td>[0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349]</td>\n", " <td>10086.104872</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>[[1226330302.63279, -300921.595597853, 551696673.399771, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093], [551696673.399771, -304863.397298569, 530694482.712349, -946345.586402424], [-1544160.63236657, 323.251642470093, -946345.586402424, 2405.39905898726]]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234]</td>\n", " <td>0.841699901311</td>\n", " <td>[57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022]</td>\n", " <td>[-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675973]</td>\n", " <td>[0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835]</td>\n", " <td>11722.6225642</td>\n", " <td>9</td>\n", " <td>0</td>\n", " <td>[[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451145, -529.274668274391], [-70253017.4577515, -90019.0797451147, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878898]]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[0.0112536020318378, 41.4132554771633, 0.0225072040636757, 31.3975496688276]</td>\n", " <td>1.0</td>\n", " <td>[0.0, 0.0, 0.0, 0.0]</td>\n", " <td>[inf, inf, inf, inf]</td>\n", " <td>None</td>\n", " <td>inf</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(2, [-84242.0345406619, 55.4430144648689, -78966.9753675336, 225.611910021196], 0.968809546465205, [35018.999166635, 19.5731125321026, 23036.8071292953, 49.0448678149636], [-2.40560942760823, 2.83261103076655, -3.42786111479457, 4.60011251069906], [0.250804617665626, 0.21605133377637, 0.180704400437667, 0.136272031474349], 10086.1048720296, 5L, 0L, [[1226330302.63279, -300921.595597853, 551696673.399771, -1544160.63236657], [-300921.595597853, 383.106734194352, -304863.397298569, 323.251642470093 ... (4 characters truncated) ... 551696673.399771, -304863.397298569, 530694482.712349, -946345.586402424], [-1544160.63236657, 323.251642470093, -946345.586402424, 2405.39905898726]]),\n", " (3, [-88155.8292501588, 27.1966436294421, 41404.0293363614, 62.6375210753234], 0.841699901311221, [57867.9999702679, 17.8272309154706, 43643.1321511158, 70.8506824864022], [-1.52339512848988, 1.52556747362489, 0.948695185143874, 0.884077878675973], [0.188161432894911, 0.187636685729916, 0.38634003237497, 0.417132778705835], 11722.6225642065, 9L, 0L, [[3348705420.55893, 433697.545104307, -70253017.4577515, -2593488.13800241], [433697.545104307, 317.810162113512, -90019.0797451145, -529.274668274391 ... (5 characters truncated) ... 70253017.4577515, -90019.0797451147, 1904722983.95976, -2183233.19448568], [-2593488.13800241, -529.27466827439, -2183233.19448568, 5019.81920878898]]),\n", " (4, [0.0112536020318378, 41.4132554771633, 0.0225072040636757, 31.3975496688276], 1.0, [0.0, 0.0, 0.0, 0.0], [inf, inf, inf, inf], None, inf, 1L, 0L, [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM houses_linregr_bedroom ORDER BY bedroom;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Prediction\n", "Compare predicted price with actual, in the case where not grouping is used. That is, there is only one regression model for all data. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "15 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>tax</th>\n", " <th>bedroom</th>\n", " <th>bath</th>\n", " <th>price</th>\n", " <th>size</th>\n", " <th>lot</th>\n", " <th>predict</th>\n", " <th>residual</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>590</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>50000</td>\n", " <td>770</td>\n", " <td>22100</td>\n", " <td>53317.4426966</td>\n", " <td>-3317.44269655</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1050</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>85000</td>\n", " <td>1410</td>\n", " <td>12000</td>\n", " <td>109152.124956</td>\n", " <td>-24152.1249556</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>20</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>22500</td>\n", " <td>1060</td>\n", " <td>3500</td>\n", " <td>51459.3486309</td>\n", " <td>-28959.3486309</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>870</td>\n", " <td>2</td>\n", " <td>2.0</td>\n", " <td>90000</td>\n", " <td>1300</td>\n", " <td>17500</td>\n", " <td>98382.2159072</td>\n", " <td>-8382.21590721</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>1320</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>133000</td>\n", " <td>1500</td>\n", " <td>30000</td>\n", " <td>121518.22141</td>\n", " <td>11481.7785904</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>1350</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>90500</td>\n", " <td>820</td>\n", " <td>25700</td>\n", " <td>77853.9455639</td>\n", " <td>12646.0544361</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>2790</td>\n", " <td>3</td>\n", " <td>2.5</td>\n", " <td>260000</td>\n", " <td>2130</td>\n", " <td>25000</td>\n", " <td>201007.926372</td>\n", " <td>58992.0736283</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>680</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>142500</td>\n", " <td>1170</td>\n", " <td>22000</td>\n", " <td>76130.7259666</td>\n", " <td>66369.2740334</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>1840</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>160000</td>\n", " <td>1500</td>\n", " <td>19000</td>\n", " <td>136578.145387</td>\n", " <td>23421.8546125</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>3680</td>\n", " <td>4</td>\n", " <td>2.0</td>\n", " <td>240000</td>\n", " <td>2790</td>\n", " <td>20000</td>\n", " <td>255033.901596</td>\n", " <td>-15033.9015962</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>1660</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>87000</td>\n", " <td>1030</td>\n", " <td>17500</td>\n", " <td>97440.5250983</td>\n", " <td>-10440.5250983</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>1620</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>118600</td>\n", " <td>1250</td>\n", " <td>20000</td>\n", " <td>117577.41536</td>\n", " <td>1022.58463968</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>3100</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>140000</td>\n", " <td>1760</td>\n", " <td>38000</td>\n", " <td>186203.89232</td>\n", " <td>-46203.8923196</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>2070</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>148000</td>\n", " <td>1550</td>\n", " <td>14000</td>\n", " <td>155946.739426</td>\n", " <td>-7946.73942552</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>650</td>\n", " <td>3</td>\n", " <td>1.5</td>\n", " <td>65000</td>\n", " <td>1450</td>\n", " <td>12000</td>\n", " <td>94497.4293105</td>\n", " <td>-29497.4293105</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 590, 2, 1.0, 50000, 770, 22100, 53317.4426965543, -3317.44269655428),\n", " (2, 1050, 3, 2.0, 85000, 1410, 12000, 109152.124955627, -24152.1249556268),\n", " (3, 20, 3, 1.0, 22500, 1060, 3500, 51459.3486308555, -28959.3486308555),\n", " (4, 870, 2, 2.0, 90000, 1300, 17500, 98382.215907206, -8382.21590720599),\n", " (5, 1320, 3, 2.0, 133000, 1500, 30000, 121518.221409606, 11481.7785903935),\n", " (6, 1350, 2, 1.0, 90500, 820, 25700, 77853.9455638568, 12646.0544361432),\n", " (7, 2790, 3, 2.5, 260000, 2130, 25000, 201007.926371722, 58992.0736282778),\n", " (8, 680, 2, 1.0, 142500, 1170, 22000, 76130.7259665615, 66369.2740334385),\n", " (9, 1840, 3, 2.0, 160000, 1500, 19000, 136578.145387499, 23421.8546125013),\n", " (10, 3680, 4, 2.0, 240000, 2790, 20000, 255033.901596231, -15033.9015962306),\n", " (11, 1660, 3, 1.0, 87000, 1030, 17500, 97440.5250982859, -10440.5250982859),\n", " (12, 1620, 3, 2.0, 118600, 1250, 20000, 117577.415360321, 1022.58463967856),\n", " (13, 3100, 3, 2.0, 140000, 1760, 38000, 186203.892319614, -46203.8923196141),\n", " (14, 2070, 2, 3.0, 148000, 1550, 14000, 155946.739425522, -7946.73942552213),\n", " (15, 650, 3, 1.5, 65000, 1450, 12000, 94497.4293105374, -29497.4293105374)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT houses.*,\n", " madlib.linregr_predict( m.coef,\n", " ARRAY[1,tax,bath,size]\n", " ) as predict,\n", " price -\n", " madlib.linregr_predict( m.coef,\n", " ARRAY[1,tax,bath,size] \n", " ) as residual\n", "FROM houses, houses_linregr m ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. Prediction with grouping\n", "Compare predicted price with actual with grouping. It means a different model is used depending on the number of bedrooms." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "15 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>tax</th>\n", " <th>bedroom</th>\n", " <th>bath</th>\n", " <th>price</th>\n", " <th>size</th>\n", " <th>lot</th>\n", " <th>predict</th>\n", " <th>residual</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>590</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>50000</td>\n", " <td>770</td>\n", " <td>22100</td>\n", " <td>43223.5393424</td>\n", " <td>6776.4606576</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1050</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>85000</td>\n", " <td>1410</td>\n", " <td>12000</td>\n", " <td>111527.60995</td>\n", " <td>-26527.6099497</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>20</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>22500</td>\n", " <td>1060</td>\n", " <td>3500</td>\n", " <td>20187.9052986</td>\n", " <td>2312.09470137</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>870</td>\n", " <td>2</td>\n", " <td>2.0</td>\n", " <td>90000</td>\n", " <td>1300</td>\n", " <td>17500</td>\n", " <td>99354.9203363</td>\n", " <td>-9354.92033626</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>1320</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>133000</td>\n", " <td>1500</td>\n", " <td>30000</td>\n", " <td>124508.080626</td>\n", " <td>8491.91937359</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>1350</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>90500</td>\n", " <td>820</td>\n", " <td>25700</td>\n", " <td>96640.8258368</td>\n", " <td>-6140.82583676</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>2790</td>\n", " <td>3</td>\n", " <td>2.5</td>\n", " <td>260000</td>\n", " <td>2130</td>\n", " <td>25000</td>\n", " <td>224650.799707</td>\n", " <td>35349.2002927</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>680</td>\n", " <td>2</td>\n", " <td>1.0</td>\n", " <td>142500</td>\n", " <td>1170</td>\n", " <td>22000</td>\n", " <td>138458.174653</td>\n", " <td>4041.82534729</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>1840</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>160000</td>\n", " <td>1500</td>\n", " <td>19000</td>\n", " <td>138650.335314</td>\n", " <td>21349.6646863</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>3680</td>\n", " <td>4</td>\n", " <td>2.0</td>\n", " <td>240000</td>\n", " <td>2790</td>\n", " <td>20000</td>\n", " <td>240000.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>1660</td>\n", " <td>3</td>\n", " <td>1.0</td>\n", " <td>87000</td>\n", " <td>1030</td>\n", " <td>17500</td>\n", " <td>62911.2752187</td>\n", " <td>24088.7247813</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>1620</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>118600</td>\n", " <td>1250</td>\n", " <td>20000</td>\n", " <td>117007.693446</td>\n", " <td>1592.30655359</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>3100</td>\n", " <td>3</td>\n", " <td>2.0</td>\n", " <td>140000</td>\n", " <td>1760</td>\n", " <td>38000</td>\n", " <td>189203.861766</td>\n", " <td>-49203.8617664</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>2070</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>148000</td>\n", " <td>1550</td>\n", " <td>14000</td>\n", " <td>143322.539832</td>\n", " <td>4677.46016813</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>650</td>\n", " <td>3</td>\n", " <td>1.5</td>\n", " <td>65000</td>\n", " <td>1450</td>\n", " <td>12000</td>\n", " <td>82452.4386727</td>\n", " <td>-17452.4386727</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 590, 2, 1.0, 50000, 770, 22100, 43223.5393423978, 6776.46065760222),\n", " (2, 1050, 3, 2.0, 85000, 1410, 12000, 111527.609949684, -26527.609949684),\n", " (3, 20, 3, 1.0, 22500, 1060, 3500, 20187.9052986341, 2312.09470136587),\n", " (4, 870, 2, 2.0, 90000, 1300, 17500, 99354.9203362612, -9354.92033626116),\n", " (5, 1320, 3, 2.0, 133000, 1500, 30000, 124508.080626412, 8491.91937358756),\n", " (6, 1350, 2, 1.0, 90500, 820, 25700, 96640.8258367579, -6140.8258367579),\n", " (7, 2790, 3, 2.5, 260000, 2130, 25000, 224650.799707327, 35349.2002926733),\n", " (8, 680, 2, 1.0, 142500, 1170, 22000, 138458.174652714, 4041.82534728572),\n", " (9, 1840, 3, 2.0, 160000, 1500, 19000, 138650.335313722, 21349.6646862777),\n", " (10, 3680, 4, 2.0, 240000, 2790, 20000, 240000.0, 0.0),\n", " (11, 1660, 3, 1.0, 87000, 1030, 17500, 62911.2752186594, 24088.7247813406),\n", " (12, 1620, 3, 2.0, 118600, 1250, 20000, 117007.693446414, 1592.30655358579),\n", " (13, 3100, 3, 2.0, 140000, 1760, 38000, 189203.861766403, -49203.8617664034),\n", " (14, 2070, 2, 3.0, 148000, 1550, 14000, 143322.539831869, 4677.46016813093),\n", " (15, 650, 3, 1.5, 65000, 1450, 12000, 82452.4386727394, -17452.4386727394)]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT houses.*,\n", " madlib.linregr_predict( m.coef,\n", " ARRAY[1,tax,bath,size]\n", " ) as predict,\n", " price -\n", " madlib.linregr_predict( m.coef,\n", " ARRAY[1,tax,bath,size] \n", " ) as residual\n", "FROM houses, houses_linregr_bedroom m\n", "WHERE houses.bedroom = m.bedroom\n", "ORDER BY id;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.16" } }, "nbformat": 4, "nbformat_minor": 1 }