community-artifacts/Statistics/Covariance-and-correlation-v1.ipynb (1,318 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Covariance and Correlation\n", "\n", "Generates a covariance or Pearson correlation matrix for pairs of numeric columns in a table. Grouping added in 1.15." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", " \"You should import from traitlets.config instead.\", ShimWarning)\n", "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" ] } ], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "u'Connected: gpadmin@madlib'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Greenplum Database 5.4.0 on GCP (demo machine)\n", "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", " \n", "# PostgreSQL local\n", "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", "\n", "# Greenplum Database 4.3.10.0\n", "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();\n", "#%sql select version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Load data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "53 rows affected.\n", "53 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>outlook</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " <th>windy</th>\n", " <th>class</th>\n", " <th>day</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>7</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>8</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>9</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>10</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>11</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>12</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>13</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>14</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>15</td>\n", " <td>None</td>\n", " <td>100.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>16</td>\n", " <td>None</td>\n", " <td>110.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Mon</td>\n", " </tr>\n", " <tr>\n", " <td>101</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>102</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>103</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>104</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>105</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>106</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>107</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>108</td>\n", " <td>sunny</td>\n", " <td>72.0</td>\n", " <td>95.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>109</td>\n", " <td>sunny</td>\n", " <td>69.0</td>\n", " <td>70.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>110</td>\n", " <td>rain</td>\n", " <td>75.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>111</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>112</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>113</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>114</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>115</td>\n", " <td>None</td>\n", " <td>100.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>116</td>\n", " <td>None</td>\n", " <td>110.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Tues</td>\n", " </tr>\n", " <tr>\n", " <td>201</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>202</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>203</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>204</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>96.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>205</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>206</td>\n", " <td>rain</td>\n", " <td>65.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>207</td>\n", " <td>overcast</td>\n", " <td>64.0</td>\n", " <td>65.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>208</td>\n", " <td>sunny</td>\n", " <td>7.0</td>\n", " <td>95.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>209</td>\n", " <td>sunny</td>\n", " <td>6.0</td>\n", " <td>70.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>210</td>\n", " <td>rain</td>\n", " <td>7.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>211</td>\n", " <td>sunny</td>\n", " <td>75.0</td>\n", " <td>70.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>212</td>\n", " <td>overcast</td>\n", " <td>72.0</td>\n", " <td>90.0</td>\n", " <td>true</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>213</td>\n", " <td>overcast</td>\n", " <td>81.0</td>\n", " <td>75.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>214</td>\n", " <td>rain</td>\n", " <td>71.0</td>\n", " <td>80.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>215</td>\n", " <td>None</td>\n", " <td>10.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>216</td>\n", " <td>None</td>\n", " <td>10.0</td>\n", " <td>100.0</td>\n", " <td>true</td>\n", " <td>None</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>217</td>\n", " <td>sunny</td>\n", " <td>85.0</td>\n", " <td>85.0</td>\n", " <td>false</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>218</td>\n", " <td>sunny</td>\n", " <td>80.0</td>\n", " <td>9.0</td>\n", " <td>true</td>\n", " <td>Dont Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>219</td>\n", " <td>overcast</td>\n", " <td>83.0</td>\n", " <td>78.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>220</td>\n", " <td>rain</td>\n", " <td>70.0</td>\n", " <td>9.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", " <tr>\n", " <td>221</td>\n", " <td>rain</td>\n", " <td>68.0</td>\n", " <td>80.0</td>\n", " <td>false</td>\n", " <td>Play</td>\n", " <td>Wed</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Mon'),\n", " (2, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Mon'),\n", " (3, u'overcast', 83.0, 78.0, u'false', u'Play', u'Mon'),\n", " (4, u'rain', 70.0, 96.0, u'false', u'Play', u'Mon'),\n", " (5, u'rain', 68.0, 80.0, u'false', u'Play', u'Mon'),\n", " (6, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Mon'),\n", " (7, u'overcast', 64.0, 65.0, u'true', u'Play', u'Mon'),\n", " (8, u'sunny', 72.0, 95.0, u'false', u'Dont Play', u'Mon'),\n", " (9, u'sunny', 69.0, 70.0, u'false', u'Play', u'Mon'),\n", " (10, u'rain', 75.0, 80.0, u'false', u'Play', u'Mon'),\n", " (11, u'sunny', 75.0, 70.0, u'true', u'Play', u'Mon'),\n", " (12, u'overcast', 72.0, 90.0, u'true', u'Play', u'Mon'),\n", " (13, u'overcast', 81.0, 75.0, u'false', u'Play', u'Mon'),\n", " (14, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Mon'),\n", " (15, None, 100.0, 100.0, u'true', None, u'Mon'),\n", " (16, None, 110.0, 100.0, u'true', None, u'Mon'),\n", " (101, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Tues'),\n", " (102, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Tues'),\n", " (103, u'overcast', 83.0, 78.0, u'false', u'Play', u'Tues'),\n", " (104, u'rain', 70.0, 96.0, u'false', u'Play', u'Tues'),\n", " (105, u'rain', 68.0, 80.0, u'false', u'Play', u'Tues'),\n", " (106, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Tues'),\n", " (107, u'overcast', 64.0, 65.0, u'true', u'Play', u'Tues'),\n", " (108, u'sunny', 72.0, 95.0, u'false', u'Dont Play', u'Tues'),\n", " (109, u'sunny', 69.0, 70.0, u'false', u'Play', u'Tues'),\n", " (110, u'rain', 75.0, 80.0, u'false', u'Play', u'Tues'),\n", " (111, u'sunny', 75.0, 70.0, u'true', u'Play', u'Tues'),\n", " (112, u'overcast', 72.0, 90.0, u'true', u'Play', u'Tues'),\n", " (113, u'overcast', 81.0, 75.0, u'false', u'Play', u'Tues'),\n", " (114, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Tues'),\n", " (115, None, 100.0, 100.0, u'true', None, u'Tues'),\n", " (116, None, 110.0, 100.0, u'true', None, u'Tues'),\n", " (201, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Wed'),\n", " (202, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Wed'),\n", " (203, u'overcast', 83.0, 78.0, u'false', u'Play', u'Wed'),\n", " (204, u'rain', 70.0, 96.0, u'false', u'Play', u'Wed'),\n", " (205, u'rain', 68.0, 80.0, u'false', u'Play', u'Wed'),\n", " (206, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Wed'),\n", " (207, u'overcast', 64.0, 65.0, u'true', u'Play', u'Wed'),\n", " (208, u'sunny', 7.0, 95.0, u'false', u'Dont Play', u'Wed'),\n", " (209, u'sunny', 6.0, 70.0, u'false', u'Play', u'Wed'),\n", " (210, u'rain', 7.0, 80.0, u'false', u'Play', u'Wed'),\n", " (211, u'sunny', 75.0, 70.0, u'true', u'Play', u'Wed'),\n", " (212, u'overcast', 72.0, 90.0, u'true', u'Play', u'Wed'),\n", " (213, u'overcast', 81.0, 75.0, u'false', u'Play', u'Wed'),\n", " (214, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Wed'),\n", " (215, None, 10.0, 100.0, u'true', None, u'Wed'),\n", " (216, None, 10.0, 100.0, u'true', None, u'Wed'),\n", " (217, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Wed'),\n", " (218, u'sunny', 80.0, 9.0, u'true', u'Dont Play', u'Wed'),\n", " (219, u'overcast', 83.0, 78.0, u'false', u'Play', u'Wed'),\n", " (220, u'rain', 70.0, 9.0, u'false', u'Play', u'Wed'),\n", " (221, u'rain', 68.0, 80.0, u'false', u'Play', u'Wed')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS example_data CASCADE;\n", "\n", "CREATE TABLE example_data( \n", " id SERIAL, \n", " outlook TEXT,\n", " temperature FLOAT8, \n", " humidity FLOAT8,\n", " windy TEXT, \n", " class TEXT,\n", " day TEXT\n", ");\n", "\n", "INSERT INTO example_data VALUES\n", "(1, 'sunny', 85, 85, 'false', 'Dont Play', 'Mon'),\n", "(2, 'sunny', 80, 90, 'true', 'Dont Play', 'Mon'),\n", "(3, 'overcast', 83, 78, 'false', 'Play', 'Mon'),\n", "(4, 'rain', 70, 96, 'false', 'Play', 'Mon'),\n", "(5, 'rain', 68, 80, 'false', 'Play', 'Mon'),\n", "(6, 'rain', 65, 70, 'true', 'Dont Play', 'Mon'),\n", "(7, 'overcast', 64, 65, 'true', 'Play', 'Mon'),\n", "(8, 'sunny', 72, 95, 'false', 'Dont Play', 'Mon'),\n", "(9, 'sunny', 69, 70, 'false', 'Play', 'Mon'),\n", "(10, 'rain', 75, 80, 'false', 'Play', 'Mon'),\n", "(11, 'sunny', 75, 70, 'true', 'Play', 'Mon'),\n", "(12, 'overcast', 72, 90, 'true', 'Play', 'Mon'),\n", "(13, 'overcast', 81, 75, 'false', 'Play', 'Mon'),\n", "(14, 'rain', 71, 80, 'true', 'Dont Play', 'Mon'),\n", "(15, NULL, 100, 100, 'true', NULL, 'Mon'),\n", "(16, NULL, 110, 100, 'true', NULL, 'Mon'),\n", "(101, 'sunny', 85, 85, 'false', 'Dont Play', 'Tues'),\n", "(102, 'sunny', 80, 90, 'true', 'Dont Play', 'Tues'),\n", "(103, 'overcast', 83, 78, 'false', 'Play', 'Tues'),\n", "(104, 'rain', 70, 96, 'false', 'Play', 'Tues'),\n", "(105, 'rain', 68, 80, 'false', 'Play', 'Tues'),\n", "(106, 'rain', 65, 70, 'true', 'Dont Play', 'Tues'),\n", "(107, 'overcast', 64, 65, 'true', 'Play', 'Tues'),\n", "(108, 'sunny', 72, 95, 'false', 'Dont Play', 'Tues'),\n", "(109, 'sunny', 69, 70, 'false', 'Play', 'Tues'),\n", "(110, 'rain', 75, 80, 'false', 'Play', 'Tues'),\n", "(111, 'sunny', 75, 70, 'true', 'Play', 'Tues'),\n", "(112, 'overcast', 72, 90, 'true', 'Play', 'Tues'),\n", "(113, 'overcast', 81, 75, 'false', 'Play', 'Tues'),\n", "(114, 'rain', 71, 80, 'true', 'Dont Play', 'Tues'),\n", "(115, NULL, 100, 100, 'true', NULL, 'Tues'),\n", "(116, NULL, 110, 100, 'true', NULL, 'Tues'),\n", "(201, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),\n", "(202, 'sunny', 80, 90, 'true', 'Dont Play', 'Wed'),\n", "(203, 'overcast', 83, 78, 'false', 'Play', 'Wed'),\n", "(204, 'rain', 70, 96, 'false', 'Play', 'Wed'),\n", "(205, 'rain', 68, 80, 'false', 'Play', 'Wed'),\n", "(206, 'rain', 65, 70, 'true', 'Dont Play', 'Wed'),\n", "(207, 'overcast', 64, 65, 'true', 'Play', 'Wed'),\n", "(208, 'sunny', 7, 95, 'false', 'Dont Play', 'Wed'),\n", "(209, 'sunny', 6, 70, 'false', 'Play', 'Wed'),\n", "(210, 'rain', 7, 80, 'false', 'Play', 'Wed'),\n", "(211, 'sunny', 75, 70, 'true', 'Play', 'Wed'),\n", "(212, 'overcast', 72, 90, 'true', 'Play', 'Wed'),\n", "(213, 'overcast', 81, 75, 'false', 'Play', 'Wed'),\n", "(214, 'rain', 71, 80, 'true', 'Dont Play', 'Wed'),\n", "(215, NULL, 10, 100, 'true', NULL, 'Wed'),\n", "(216, NULL, 10, 100, 'true', NULL, 'Wed'),\n", "(217, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),\n", "(218, 'sunny', 80, 9, 'true', 'Dont Play', 'Wed'),\n", "(219, 'overcast', 83, 78, 'false', 'Play', 'Wed'),\n", "(220, 'rain', 70, 9, 'false', 'Play', 'Wed'),\n", "(221, 'rain', 68, 80, 'false', 'Play', 'Wed');\n", "\n", "SELECT * FROM example_data ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Correlation\n", "Get correlation between temperature and humidity:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>correlation</th>\n", " </tr>\n", " <tr>\n", " <td>Summary for 'Correlation' function<br>Output table = example_data_output<br>Producing correlation for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.26578783988952637)</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u\"Summary for 'Correlation' function\\nOutput table = example_data_output\\nProducing correlation for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.26578783988952637)\",)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n", "SELECT madlib.correlation( 'example_data',\n", " 'example_data_output',\n", " 'temperature, humidity'\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the correlation matrix:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>column_position</th>\n", " <th>variable</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>temperature</td>\n", " <td>1.0</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>humidity</td>\n", " <td>0.00607993890409</td>\n", " <td>1.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'temperature', 1.0, None), (2, u'humidity', 0.00607993890408995, 1.0)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output ORDER BY column_position; " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source</th>\n", " <th>output_table</th>\n", " <th>column_names</th>\n", " <th>mean_vector</th>\n", " <th>total_rows_processed</th>\n", " </tr>\n", " <tr>\n", " <td>Correlation</td>\n", " <td>example_data</td>\n", " <td>example_data_output</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " <td>[70.188679245283, 79.8679245283019]</td>\n", " <td>53</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], [70.188679245283, 79.8679245283019], 53L)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output_summary;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Correlation with grouping\n", "Run the correlation by day:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>correlation</th>\n", " </tr>\n", " <tr>\n", " <td>Summary for 'Correlation' function<br>Output table = example_data_output<br>Grouping columns: day<br>Producing correlation for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.22429895401000977)</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u\"Summary for 'Correlation' function\\nOutput table = example_data_output\\nGrouping columns: day\\nProducing correlation for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.22429895401000977)\",)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n", "SELECT madlib.correlation( 'example_data',\n", " 'example_data_output',\n", " 'temperature, humidity',\n", " FALSE,\n", " 'day'\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the correlation matrix by group:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>column_position</th>\n", " <th>variable</th>\n", " <th>day</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>temperature</td>\n", " <td>Mon</td>\n", " <td>1.0</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>humidity</td>\n", " <td>Mon</td>\n", " <td>0.616876934549</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>temperature</td>\n", " <td>Tues</td>\n", " <td>1.0</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>humidity</td>\n", " <td>Tues</td>\n", " <td>0.616876934549</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>temperature</td>\n", " <td>Wed</td>\n", " <td>1.0</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>humidity</td>\n", " <td>Wed</td>\n", " <td>-0.289696693685</td>\n", " <td>1.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'temperature', u'Mon', 1.0, None),\n", " (2, u'humidity', u'Mon', 0.616876934548786, 1.0),\n", " (1, u'temperature', u'Tues', 1.0, None),\n", " (2, u'humidity', u'Tues', 0.616876934548786, 1.0),\n", " (1, u'temperature', u'Wed', 1.0, None),\n", " (2, u'humidity', u'Wed', -0.28969669368457, 1.0)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output ORDER BY day, column_position;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source</th>\n", " <th>output_table</th>\n", " <th>column_names</th>\n", " <th>day</th>\n", " <th>mean_vector</th>\n", " <th>total_rows_processed</th>\n", " </tr>\n", " <tr>\n", " <td>Correlation</td>\n", " <td>example_data</td>\n", " <td>example_data_output</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " <td>Mon</td>\n", " <td>[77.5, 82.75]</td>\n", " <td>16</td>\n", " </tr>\n", " <tr>\n", " <td>Correlation</td>\n", " <td>example_data</td>\n", " <td>example_data_output</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " <td>Tues</td>\n", " <td>[77.5, 82.75]</td>\n", " <td>16</td>\n", " </tr>\n", " <tr>\n", " <td>Correlation</td>\n", " <td>example_data</td>\n", " <td>example_data_output</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " <td>Wed</td>\n", " <td>[59.0476190476191, 75.4761904761905]</td>\n", " <td>21</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Mon', [77.5, 82.75], 16L),\n", " (u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Tues', [77.5, 82.75], 16L),\n", " (u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Wed', [59.0476190476191, 75.4761904761905], 21L)]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output_summary ORDER BY day;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Covariance\n", "Get covariance between temperature and humidity:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>covariance</th>\n", " </tr>\n", " <tr>\n", " <td>Summary for 'Covariance' function<br>Output table = example_data_output<br>Producing covariance for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.1925361156463623)</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u\"Summary for 'Covariance' function\\nOutput table = example_data_output\\nProducing covariance for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.1925361156463623)\",)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n", "SELECT madlib.covariance( 'example_data',\n", " 'example_data_output',\n", " 'temperature, humidity'\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View covariance matrix:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>column_position</th>\n", " <th>variable</th>\n", " <th>temperature</th>\n", " <th>humidity</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>temperature</td>\n", " <td>507.926664293</td>\n", " <td>None</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>humidity</td>\n", " <td>2.40227839089</td>\n", " <td>307.35991456</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, u'temperature', 507.926664293343, None),\n", " (2, u'humidity', 2.40227839088644, 307.359914560342)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output ORDER BY column_position; " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the summary table:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>method</th>\n", " <th>source</th>\n", " <th>output_table</th>\n", " <th>column_names</th>\n", " <th>mean_vector</th>\n", " <th>total_rows_processed</th>\n", " </tr>\n", " <tr>\n", " <td>Covariance</td>\n", " <td>example_data</td>\n", " <td>example_data_output</td>\n", " <td>[u'temperature', u'humidity']</td>\n", " <td>[70.188679245283, 79.8679245283019]</td>\n", " <td>53</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'Covariance', u'example_data', u'example_data_output', [u'temperature', u'humidity'], [70.188679245283, 79.8679245283019], 53L)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM example_data_output_summary;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }