community-artifacts/Statistics/Covariance-and-correlation-v1.ipynb (1,318 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Covariance and Correlation\n",
"\n",
"Generates a covariance or Pearson correlation matrix for pairs of numeric columns in a table. Grouping added in 1.15."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpadmin@madlib'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
"\n",
"# Greenplum Database 4.3.10.0\n",
"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"53 rows affected.\n",
"53 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>outlook</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>day</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>None</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>None</td>\n",
" <td>110.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Mon</td>\n",
" </tr>\n",
" <tr>\n",
" <td>101</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>102</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>103</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>104</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>105</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>106</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>107</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>108</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>109</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>110</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>111</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>112</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>113</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>114</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>115</td>\n",
" <td>None</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>116</td>\n",
" <td>None</td>\n",
" <td>110.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Tues</td>\n",
" </tr>\n",
" <tr>\n",
" <td>201</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>202</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>203</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>204</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>205</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>206</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>207</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>208</td>\n",
" <td>sunny</td>\n",
" <td>7.0</td>\n",
" <td>95.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>209</td>\n",
" <td>sunny</td>\n",
" <td>6.0</td>\n",
" <td>70.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>210</td>\n",
" <td>rain</td>\n",
" <td>7.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>211</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>212</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>true</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>213</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>214</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>215</td>\n",
" <td>None</td>\n",
" <td>10.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>216</td>\n",
" <td>None</td>\n",
" <td>10.0</td>\n",
" <td>100.0</td>\n",
" <td>true</td>\n",
" <td>None</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>217</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>false</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>218</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>9.0</td>\n",
" <td>true</td>\n",
" <td>Dont Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>219</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>220</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>9.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
" <tr>\n",
" <td>221</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>false</td>\n",
" <td>Play</td>\n",
" <td>Wed</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Mon'),\n",
" (2, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Mon'),\n",
" (3, u'overcast', 83.0, 78.0, u'false', u'Play', u'Mon'),\n",
" (4, u'rain', 70.0, 96.0, u'false', u'Play', u'Mon'),\n",
" (5, u'rain', 68.0, 80.0, u'false', u'Play', u'Mon'),\n",
" (6, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Mon'),\n",
" (7, u'overcast', 64.0, 65.0, u'true', u'Play', u'Mon'),\n",
" (8, u'sunny', 72.0, 95.0, u'false', u'Dont Play', u'Mon'),\n",
" (9, u'sunny', 69.0, 70.0, u'false', u'Play', u'Mon'),\n",
" (10, u'rain', 75.0, 80.0, u'false', u'Play', u'Mon'),\n",
" (11, u'sunny', 75.0, 70.0, u'true', u'Play', u'Mon'),\n",
" (12, u'overcast', 72.0, 90.0, u'true', u'Play', u'Mon'),\n",
" (13, u'overcast', 81.0, 75.0, u'false', u'Play', u'Mon'),\n",
" (14, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Mon'),\n",
" (15, None, 100.0, 100.0, u'true', None, u'Mon'),\n",
" (16, None, 110.0, 100.0, u'true', None, u'Mon'),\n",
" (101, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Tues'),\n",
" (102, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Tues'),\n",
" (103, u'overcast', 83.0, 78.0, u'false', u'Play', u'Tues'),\n",
" (104, u'rain', 70.0, 96.0, u'false', u'Play', u'Tues'),\n",
" (105, u'rain', 68.0, 80.0, u'false', u'Play', u'Tues'),\n",
" (106, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Tues'),\n",
" (107, u'overcast', 64.0, 65.0, u'true', u'Play', u'Tues'),\n",
" (108, u'sunny', 72.0, 95.0, u'false', u'Dont Play', u'Tues'),\n",
" (109, u'sunny', 69.0, 70.0, u'false', u'Play', u'Tues'),\n",
" (110, u'rain', 75.0, 80.0, u'false', u'Play', u'Tues'),\n",
" (111, u'sunny', 75.0, 70.0, u'true', u'Play', u'Tues'),\n",
" (112, u'overcast', 72.0, 90.0, u'true', u'Play', u'Tues'),\n",
" (113, u'overcast', 81.0, 75.0, u'false', u'Play', u'Tues'),\n",
" (114, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Tues'),\n",
" (115, None, 100.0, 100.0, u'true', None, u'Tues'),\n",
" (116, None, 110.0, 100.0, u'true', None, u'Tues'),\n",
" (201, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Wed'),\n",
" (202, u'sunny', 80.0, 90.0, u'true', u'Dont Play', u'Wed'),\n",
" (203, u'overcast', 83.0, 78.0, u'false', u'Play', u'Wed'),\n",
" (204, u'rain', 70.0, 96.0, u'false', u'Play', u'Wed'),\n",
" (205, u'rain', 68.0, 80.0, u'false', u'Play', u'Wed'),\n",
" (206, u'rain', 65.0, 70.0, u'true', u'Dont Play', u'Wed'),\n",
" (207, u'overcast', 64.0, 65.0, u'true', u'Play', u'Wed'),\n",
" (208, u'sunny', 7.0, 95.0, u'false', u'Dont Play', u'Wed'),\n",
" (209, u'sunny', 6.0, 70.0, u'false', u'Play', u'Wed'),\n",
" (210, u'rain', 7.0, 80.0, u'false', u'Play', u'Wed'),\n",
" (211, u'sunny', 75.0, 70.0, u'true', u'Play', u'Wed'),\n",
" (212, u'overcast', 72.0, 90.0, u'true', u'Play', u'Wed'),\n",
" (213, u'overcast', 81.0, 75.0, u'false', u'Play', u'Wed'),\n",
" (214, u'rain', 71.0, 80.0, u'true', u'Dont Play', u'Wed'),\n",
" (215, None, 10.0, 100.0, u'true', None, u'Wed'),\n",
" (216, None, 10.0, 100.0, u'true', None, u'Wed'),\n",
" (217, u'sunny', 85.0, 85.0, u'false', u'Dont Play', u'Wed'),\n",
" (218, u'sunny', 80.0, 9.0, u'true', u'Dont Play', u'Wed'),\n",
" (219, u'overcast', 83.0, 78.0, u'false', u'Play', u'Wed'),\n",
" (220, u'rain', 70.0, 9.0, u'false', u'Play', u'Wed'),\n",
" (221, u'rain', 68.0, 80.0, u'false', u'Play', u'Wed')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS example_data CASCADE;\n",
"\n",
"CREATE TABLE example_data( \n",
" id SERIAL, \n",
" outlook TEXT,\n",
" temperature FLOAT8, \n",
" humidity FLOAT8,\n",
" windy TEXT, \n",
" class TEXT,\n",
" day TEXT\n",
");\n",
"\n",
"INSERT INTO example_data VALUES\n",
"(1, 'sunny', 85, 85, 'false', 'Dont Play', 'Mon'),\n",
"(2, 'sunny', 80, 90, 'true', 'Dont Play', 'Mon'),\n",
"(3, 'overcast', 83, 78, 'false', 'Play', 'Mon'),\n",
"(4, 'rain', 70, 96, 'false', 'Play', 'Mon'),\n",
"(5, 'rain', 68, 80, 'false', 'Play', 'Mon'),\n",
"(6, 'rain', 65, 70, 'true', 'Dont Play', 'Mon'),\n",
"(7, 'overcast', 64, 65, 'true', 'Play', 'Mon'),\n",
"(8, 'sunny', 72, 95, 'false', 'Dont Play', 'Mon'),\n",
"(9, 'sunny', 69, 70, 'false', 'Play', 'Mon'),\n",
"(10, 'rain', 75, 80, 'false', 'Play', 'Mon'),\n",
"(11, 'sunny', 75, 70, 'true', 'Play', 'Mon'),\n",
"(12, 'overcast', 72, 90, 'true', 'Play', 'Mon'),\n",
"(13, 'overcast', 81, 75, 'false', 'Play', 'Mon'),\n",
"(14, 'rain', 71, 80, 'true', 'Dont Play', 'Mon'),\n",
"(15, NULL, 100, 100, 'true', NULL, 'Mon'),\n",
"(16, NULL, 110, 100, 'true', NULL, 'Mon'),\n",
"(101, 'sunny', 85, 85, 'false', 'Dont Play', 'Tues'),\n",
"(102, 'sunny', 80, 90, 'true', 'Dont Play', 'Tues'),\n",
"(103, 'overcast', 83, 78, 'false', 'Play', 'Tues'),\n",
"(104, 'rain', 70, 96, 'false', 'Play', 'Tues'),\n",
"(105, 'rain', 68, 80, 'false', 'Play', 'Tues'),\n",
"(106, 'rain', 65, 70, 'true', 'Dont Play', 'Tues'),\n",
"(107, 'overcast', 64, 65, 'true', 'Play', 'Tues'),\n",
"(108, 'sunny', 72, 95, 'false', 'Dont Play', 'Tues'),\n",
"(109, 'sunny', 69, 70, 'false', 'Play', 'Tues'),\n",
"(110, 'rain', 75, 80, 'false', 'Play', 'Tues'),\n",
"(111, 'sunny', 75, 70, 'true', 'Play', 'Tues'),\n",
"(112, 'overcast', 72, 90, 'true', 'Play', 'Tues'),\n",
"(113, 'overcast', 81, 75, 'false', 'Play', 'Tues'),\n",
"(114, 'rain', 71, 80, 'true', 'Dont Play', 'Tues'),\n",
"(115, NULL, 100, 100, 'true', NULL, 'Tues'),\n",
"(116, NULL, 110, 100, 'true', NULL, 'Tues'),\n",
"(201, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),\n",
"(202, 'sunny', 80, 90, 'true', 'Dont Play', 'Wed'),\n",
"(203, 'overcast', 83, 78, 'false', 'Play', 'Wed'),\n",
"(204, 'rain', 70, 96, 'false', 'Play', 'Wed'),\n",
"(205, 'rain', 68, 80, 'false', 'Play', 'Wed'),\n",
"(206, 'rain', 65, 70, 'true', 'Dont Play', 'Wed'),\n",
"(207, 'overcast', 64, 65, 'true', 'Play', 'Wed'),\n",
"(208, 'sunny', 7, 95, 'false', 'Dont Play', 'Wed'),\n",
"(209, 'sunny', 6, 70, 'false', 'Play', 'Wed'),\n",
"(210, 'rain', 7, 80, 'false', 'Play', 'Wed'),\n",
"(211, 'sunny', 75, 70, 'true', 'Play', 'Wed'),\n",
"(212, 'overcast', 72, 90, 'true', 'Play', 'Wed'),\n",
"(213, 'overcast', 81, 75, 'false', 'Play', 'Wed'),\n",
"(214, 'rain', 71, 80, 'true', 'Dont Play', 'Wed'),\n",
"(215, NULL, 10, 100, 'true', NULL, 'Wed'),\n",
"(216, NULL, 10, 100, 'true', NULL, 'Wed'),\n",
"(217, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),\n",
"(218, 'sunny', 80, 9, 'true', 'Dont Play', 'Wed'),\n",
"(219, 'overcast', 83, 78, 'false', 'Play', 'Wed'),\n",
"(220, 'rain', 70, 9, 'false', 'Play', 'Wed'),\n",
"(221, 'rain', 68, 80, 'false', 'Play', 'Wed');\n",
"\n",
"SELECT * FROM example_data ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Correlation\n",
"Get correlation between temperature and humidity:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>correlation</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Summary for 'Correlation' function<br>Output table = example_data_output<br>Producing correlation for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.26578783988952637)</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u\"Summary for 'Correlation' function\\nOutput table = example_data_output\\nProducing correlation for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.26578783988952637)\",)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n",
"SELECT madlib.correlation( 'example_data',\n",
" 'example_data_output',\n",
" 'temperature, humidity'\n",
" );"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the correlation matrix:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>column_position</th>\n",
" <th>variable</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>temperature</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>humidity</td>\n",
" <td>0.00607993890409</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'temperature', 1.0, None), (2, u'humidity', 0.00607993890408995, 1.0)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output ORDER BY column_position; "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>method</th>\n",
" <th>source</th>\n",
" <th>output_table</th>\n",
" <th>column_names</th>\n",
" <th>mean_vector</th>\n",
" <th>total_rows_processed</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Correlation</td>\n",
" <td>example_data</td>\n",
" <td>example_data_output</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" <td>[70.188679245283, 79.8679245283019]</td>\n",
" <td>53</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], [70.188679245283, 79.8679245283019], 53L)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Correlation with grouping\n",
"Run the correlation by day:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>correlation</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Summary for 'Correlation' function<br>Output table = example_data_output<br>Grouping columns: day<br>Producing correlation for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.22429895401000977)</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u\"Summary for 'Correlation' function\\nOutput table = example_data_output\\nGrouping columns: day\\nProducing correlation for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.22429895401000977)\",)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n",
"SELECT madlib.correlation( 'example_data',\n",
" 'example_data_output',\n",
" 'temperature, humidity',\n",
" FALSE,\n",
" 'day'\n",
" );"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the correlation matrix by group:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>column_position</th>\n",
" <th>variable</th>\n",
" <th>day</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>temperature</td>\n",
" <td>Mon</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>humidity</td>\n",
" <td>Mon</td>\n",
" <td>0.616876934549</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>temperature</td>\n",
" <td>Tues</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>humidity</td>\n",
" <td>Tues</td>\n",
" <td>0.616876934549</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>temperature</td>\n",
" <td>Wed</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>humidity</td>\n",
" <td>Wed</td>\n",
" <td>-0.289696693685</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'temperature', u'Mon', 1.0, None),\n",
" (2, u'humidity', u'Mon', 0.616876934548786, 1.0),\n",
" (1, u'temperature', u'Tues', 1.0, None),\n",
" (2, u'humidity', u'Tues', 0.616876934548786, 1.0),\n",
" (1, u'temperature', u'Wed', 1.0, None),\n",
" (2, u'humidity', u'Wed', -0.28969669368457, 1.0)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output ORDER BY day, column_position;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>method</th>\n",
" <th>source</th>\n",
" <th>output_table</th>\n",
" <th>column_names</th>\n",
" <th>day</th>\n",
" <th>mean_vector</th>\n",
" <th>total_rows_processed</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Correlation</td>\n",
" <td>example_data</td>\n",
" <td>example_data_output</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" <td>Mon</td>\n",
" <td>[77.5, 82.75]</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Correlation</td>\n",
" <td>example_data</td>\n",
" <td>example_data_output</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" <td>Tues</td>\n",
" <td>[77.5, 82.75]</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Correlation</td>\n",
" <td>example_data</td>\n",
" <td>example_data_output</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" <td>Wed</td>\n",
" <td>[59.0476190476191, 75.4761904761905]</td>\n",
" <td>21</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Mon', [77.5, 82.75], 16L),\n",
" (u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Tues', [77.5, 82.75], 16L),\n",
" (u'Correlation', u'example_data', u'example_data_output', [u'temperature', u'humidity'], u'Wed', [59.0476190476191, 75.4761904761905], 21L)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output_summary ORDER BY day;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Covariance\n",
"Get covariance between temperature and humidity:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>covariance</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Summary for 'Covariance' function<br>Output table = example_data_output<br>Producing covariance for columns: temperature,humidity<br>Total run time = ('example_data_output', 2, 0.1925361156463623)</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u\"Summary for 'Covariance' function\\nOutput table = example_data_output\\nProducing covariance for columns: temperature,humidity\\nTotal run time = ('example_data_output', 2, 0.1925361156463623)\",)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS example_data_output, example_data_output_summary;\n",
"SELECT madlib.covariance( 'example_data',\n",
" 'example_data_output',\n",
" 'temperature, humidity'\n",
" );"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View covariance matrix:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>column_position</th>\n",
" <th>variable</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>temperature</td>\n",
" <td>507.926664293</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>humidity</td>\n",
" <td>2.40227839089</td>\n",
" <td>307.35991456</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'temperature', 507.926664293343, None),\n",
" (2, u'humidity', 2.40227839088644, 307.359914560342)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output ORDER BY column_position; "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>method</th>\n",
" <th>source</th>\n",
" <th>output_table</th>\n",
" <th>column_names</th>\n",
" <th>mean_vector</th>\n",
" <th>total_rows_processed</th>\n",
" </tr>\n",
" <tr>\n",
" <td>Covariance</td>\n",
" <td>example_data</td>\n",
" <td>example_data_output</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" <td>[70.188679245283, 79.8679245283019]</td>\n",
" <td>53</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'Covariance', u'example_data', u'example_data_output', [u'temperature', u'humidity'], [70.188679245283, 79.8679245283019], 53L)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM example_data_output_summary;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}