community-artifacts/Supervised-learning/Logistic-regression-v1.ipynb (892 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", " \"You should import from traitlets.config instead.\", ShimWarning)\n", "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" ] } ], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "u'Connected: fmcquillan@madlib'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Greenplum 4.3.10.0\n", "# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", " \n", "# PostgreSQL local\n", "%sql postgresql://fmcquillan@localhost:5432/madlib" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();\n", "#%sql select version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load data\n", "This data set is related to predicting a second heart attack given treatment and health factors." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "20 rows affected.\n", "20 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>second_attack</th>\n", " <th>treatment</th>\n", " <th>trait_anxiety</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>70</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>80</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>50</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>60</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>40</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>65</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>75</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>80</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>70</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>60</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>65</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>50</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>35</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>40</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>50</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>45</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>50</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>60</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 1, 1, 70),\n", " (2, 1, 1, 80),\n", " (3, 1, 1, 50),\n", " (4, 1, 0, 60),\n", " (5, 1, 0, 40),\n", " (6, 1, 0, 65),\n", " (7, 1, 0, 75),\n", " (8, 1, 0, 80),\n", " (9, 1, 0, 70),\n", " (10, 1, 0, 60),\n", " (11, 0, 1, 65),\n", " (12, 0, 1, 50),\n", " (13, 0, 1, 45),\n", " (14, 0, 1, 35),\n", " (15, 0, 1, 40),\n", " (16, 0, 1, 50),\n", " (17, 0, 0, 55),\n", " (18, 0, 0, 45),\n", " (19, 0, 0, 50),\n", " (20, 0, 0, 60)]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS patients;\n", "\n", "CREATE TABLE patients( id INTEGER NOT NULL,\n", " second_attack INTEGER,\n", " treatment INTEGER,\n", " trait_anxiety INTEGER);\n", " \n", "INSERT INTO patients VALUES \n", "(1, 1, 1, 70),\n", "(2, 1, 1, 80),\n", "(3, 1, 1, 50),\n", "(4, 1, 0, 60),\n", "(5, 1, 0, 40),\n", "(6, 1, 0, 65),\n", "(7, 1, 0, 75),\n", "(8, 1, 0, 80),\n", "(9, 1, 0, 70),\n", "(10, 1, 0, 60),\n", "(11, 0, 1, 65),\n", "(12, 0, 1, 50),\n", "(13, 0, 1, 45),\n", "(14, 0, 1, 35),\n", "(15, 0, 1, 40),\n", "(16, 0, 1, 50),\n", "(17, 0, 0, 55),\n", "(18, 0, 0, 45),\n", "(19, 0, 0, 50),\n", "(20, 0, 0, 60);\n", "\n", "SELECT * FROM patients ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Train regression model" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source_table</th>\n", " <th>out_table</th>\n", " <th>dependent_varname</th>\n", " <th>independent_varname</th>\n", " <th>optimizer_params</th>\n", " <th>num_all_groups</th>\n", " <th>num_failed_groups</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>grouping_col</th>\n", " </tr>\n", " <tr>\n", " <td>logregr</td>\n", " <td>patients</td>\n", " <td>patients_logregr</td>\n", " <td>second_attack</td>\n", " <td>ARRAY[1, treatment, trait_anxiety]</td>\n", " <td>optimizer=irls, max_iter=20, tolerance=0.0001</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>20</td>\n", " <td>0</td>\n", " <td>None</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'logregr', u'patients', u'patients_logregr', u'second_attack', u'ARRAY[1, treatment, trait_anxiety]', u'optimizer=irls, max_iter=20, tolerance=0.0001', 1, 0, 20, 0, None)]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS patients_logregr, patients_logregr_summary;\n", "\n", "SELECT madlib.logregr_train( 'patients', -- Source table\n", " 'patients_logregr', -- Output table\n", " 'second_attack', -- Dependent variable\n", " 'ARRAY[1, treatment, trait_anxiety]', -- Feature vector\n", " NULL, -- Grouping\n", " 20, -- Max iterations\n", " 'irls' -- Optimizer to use\n", " );\n", "\n", "SELECT * FROM patients_logregr_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. View regression results" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>coef</th>\n", " <th>log_likelihood</th>\n", " <th>std_err</th>\n", " <th>z_stats</th>\n", " <th>p_values</th>\n", " <th>odds_ratios</th>\n", " <th>condition_no</th>\n", " <th>num_rows_processed</th>\n", " <th>num_missing_rows_skipped</th>\n", " <th>num_iterations</th>\n", " <th>variance_covariance</th>\n", " </tr>\n", " <tr>\n", " <td>[-6.36346994178192, -1.02410605239327, 0.119044916668607]</td>\n", " <td>-9.41018298389</td>\n", " <td>[3.21389766375099, 1.17107844860319, 0.0549790458269317]</td>\n", " <td>[-1.97998524145757, -0.874498248699539, 2.16527796868916]</td>\n", " <td>[0.0477051870698145, 0.381846973530455, 0.0303664045046183]</td>\n", " <td>[0.00172337630923221, 0.359117354054956, 1.12642051220895]</td>\n", " <td>326.081922792</td>\n", " <td>20</td>\n", " <td>0</td>\n", " <td>5</td>\n", " <td>[[10.329138193064, -0.474304665195738, -0.171995901260057], [-0.474304665195738, 1.37142473278286, -0.00119520703381591], [-0.171995901260057, -0.00119520703381591, 0.00302269548003986]]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[([-6.36346994178192, -1.02410605239327, 0.119044916668607], -9.41018298388876, [3.21389766375099, 1.17107844860319, 0.0549790458269317], [-1.97998524145757, -0.874498248699539, 2.16527796868916], [0.0477051870698145, 0.381846973530455, 0.0303664045046183], [0.00172337630923221, 0.359117354054956, 1.12642051220895], 326.081922791575, 20L, 0L, 5, [[10.329138193064, -0.474304665195738, -0.171995901260057], [-0.474304665195738, 1.37142473278286, -0.00119520703381591], [-0.171995901260057, -0.00119520703381591, 0.00302269548003986]])]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * from patients_logregr;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Output formatting\n", "Alternatively, unnest the arrays in the results for easier reading of output:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>attribute</th>\n", " <th>coefficient</th>\n", " <th>standard_error</th>\n", " <th>z_stat</th>\n", " <th>pvalue</th>\n", " <th>odds_ratio</th>\n", " </tr>\n", " <tr>\n", " <td>intercept</td>\n", " <td>-6.36346994178</td>\n", " <td>3.21389766375</td>\n", " <td>-1.97998524146</td>\n", " <td>0.0477051870698</td>\n", " <td>0.00172337630923</td>\n", " </tr>\n", " <tr>\n", " <td>treatment</td>\n", " <td>-1.02410605239</td>\n", " <td>1.1710784486</td>\n", " <td>-0.8744982487</td>\n", " <td>0.38184697353</td>\n", " <td>0.359117354055</td>\n", " </tr>\n", " <tr>\n", " <td>trait_anxiety</td>\n", " <td>0.119044916669</td>\n", " <td>0.0549790458269</td>\n", " <td>2.16527796869</td>\n", " <td>0.0303664045046</td>\n", " <td>1.12642051221</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'intercept', -6.36346994178192, 3.21389766375099, -1.97998524145757, 0.0477051870698145, 0.00172337630923221),\n", " (u'treatment', -1.02410605239327, 1.17107844860319, -0.874498248699539, 0.381846973530455, 0.359117354054956),\n", " (u'trait_anxiety', 0.119044916668607, 0.0549790458269317, 2.16527796868916, 0.0303664045046183, 1.12642051220895)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) as attribute,\n", " unnest(coef) as coefficient,\n", " unnest(std_err) as standard_error,\n", " unnest(z_stats) as z_stat,\n", " unnest(p_values) as pvalue,\n", " unnest(odds_ratios) as odds_ratio\n", " FROM patients_logregr;" ] }, { "cell_type": "markdown", "metadata": { "scrolled": true }, "source": [ "# 5. Predict outcome\n", "This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>logregr_predict</th>\n", " <th>second_attack</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>False</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>False</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>True</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>True</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>True</td>\n", " <td>False</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, True, True),\n", " (2, True, True),\n", " (3, False, True),\n", " (4, True, True),\n", " (5, False, True),\n", " (6, True, True),\n", " (7, True, True),\n", " (8, True, True),\n", " (9, True, True),\n", " (10, True, True),\n", " (11, True, False),\n", " (12, False, False),\n", " (13, False, False),\n", " (14, False, False),\n", " (15, False, False),\n", " (16, False, False),\n", " (17, True, False),\n", " (18, False, False),\n", " (19, False, False),\n", " (20, True, False)]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "-- Display prediction value along with the original value\n", "SELECT p.id, madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]),\n", " p.second_attack::BOOLEAN\n", "FROM patients p, patients_logregr m\n", "ORDER BY p.id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Predict probability\n", "Predicting the probability of the dependent variable being TRUE." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>logregr_predict_prob</th>\n", " <th>second_attack</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>0.720223028942</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>0.894354902502</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>0.192269541755</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>0.685513072239</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>0.167747881509</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>0.798098108915</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>0.928568075753</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>0.959305763694</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>0.877576117431</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>0.685513072239</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>0.586700895943</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>0.192269541755</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>0.116032010633</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>0.0383829143135</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>0.0674976224148</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>0.192269541755</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>17</td>\n", " <td>0.545870774303</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>18</td>\n", " <td>0.267675422387</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>19</td>\n", " <td>0.398618639285</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <td>20</td>\n", " <td>0.685513072239</td>\n", " <td>False</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 0.720223028941527, True),\n", " (2, 0.894354902502048, True),\n", " (3, 0.192269541755171, True),\n", " (4, 0.685513072239347, True),\n", " (5, 0.167747881508857, True),\n", " (6, 0.79809810891514, True),\n", " (7, 0.928568075752503, True),\n", " (8, 0.959305763693571, True),\n", " (9, 0.877576117431452, True),\n", " (10, 0.685513072239347, True),\n", " (11, 0.586700895943317, False),\n", " (12, 0.192269541755171, False),\n", " (13, 0.116032010632994, False),\n", " (14, 0.0383829143134982, False),\n", " (15, 0.0674976224147597, False),\n", " (16, 0.192269541755171, False),\n", " (17, 0.545870774302621, False),\n", " (18, 0.267675422387132, False),\n", " (19, 0.398618639285111, False),\n", " (20, 0.685513072239347, False)]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "-- Display prediction value along with the original value\n", "SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]),\n", " p.second_attack::BOOLEAN\n", "FROM patients p, patients_logregr m\n", "ORDER BY p.id;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }