community-artifacts/Unsupervised-learning/PCA-train-v1.ipynb (991 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Principle component analysis training (MADlib v1.10+)\n",
"Demonstrates PCA training including grouping, which was introduced in MADlib v1.10"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The sql extension is already loaded. To reload it, use:\n",
" %reload_ext sql\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpdbchina@madlib'"
]
},
"execution_count": 212,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
"#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.10.0-dev, git revision: rc/v1.9alpha-rc1-91-ga0325d6, cmake configuration time: Wed Jan 11 01:36:53 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.10.0-dev, git revision: rc/v1.9alpha-rc1-91-ga0325d6, cmake configuration time: Wed Jan 11 01:36:53 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Create sample data in dense matrix form"
]
},
{
"cell_type": "code",
"execution_count": 284,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"3 rows affected.\n",
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>row_vec</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[1.0, 2.0, 3.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[2.0, 1.0, 2.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[3.0, 2.0, 1.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [1.0, 2.0, 3.0]), (2, [2.0, 1.0, 2.0]), (3, [3.0, 2.0, 1.0])]"
]
},
"execution_count": 284,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS mat;\n",
"\n",
"CREATE TABLE mat (\n",
" id integer,\n",
" row_vec double precision[]\n",
");\n",
"\n",
"INSERT INTO mat VALUES\n",
"(1, '{1,2,3}'),\n",
"(2, '{2,1,2}'),\n",
"(3, '{3,2,1}');\n",
"\n",
"SELECT * FROM mat ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Run the PCA function for a specified number of principal components"
]
},
{
"cell_type": "code",
"execution_count": 285,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>principal_components</th>\n",
" <th>std_dev</th>\n",
" <th>proportion</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[0.707106781186547, 0.0, -0.707106781186547]</td>\n",
" <td>1.41421356237</td>\n",
" <td>0.857142857142</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[0.0, -1.0, 0.0]</td>\n",
" <td>0.57735026919</td>\n",
" <td>0.142857142857</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [0.707106781186547, 0.0, -0.707106781186547], 1.41421356237309, 0.857142857142244),\n",
" (2, [0.0, -1.0, 0.0], 0.577350269189626, 0.142857142857041)]"
]
},
"execution_count": 285,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS result_table, result_table_mean;\n",
"\n",
"SELECT madlib.pca_train('mat', -- Source table\n",
" 'result_table', -- Output table\n",
" 'id', -- Row id of source table\n",
" 2); -- Number of principle components\n",
"\n",
"SELECT * FROM result_table ORDER BY row_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Run the PCA function for a specified proportion of variance"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>principal_components</th>\n",
" <th>std_dev</th>\n",
" <th>proportion</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[-0.707106781186547, -5.55111512312578e-17, 0.707106781186547]</td>\n",
" <td>1.41421356237</td>\n",
" <td>0.857142857142</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[-5.55111512312578e-17, -1.0, -4.16333634234434e-17]</td>\n",
" <td>0.57735026919</td>\n",
" <td>0.142857142857</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [-0.707106781186547, -5.55111512312578e-17, 0.707106781186547], 1.41421356237309, 0.857142857142244),\n",
" (2, [-5.55111512312578e-17, -1.0, -4.16333634234434e-17], 0.577350269189626, 0.142857142857041)]"
]
},
"execution_count": 286,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS result_table, result_table_mean;\n",
"SELECT madlib.pca_train('mat', -- Source table\n",
" 'result_table', -- Output table\n",
" 'id', -- Row id of source table\n",
" 0.9); -- Proportion of variance\n",
"SELECT * FROM result_table ORDER BY row_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Grouping with dense matrices\n",
"Create sample data in dense matrix form with a grouping column:"
]
},
{
"cell_type": "code",
"execution_count": 287,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"6 rows affected.\n",
"6 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>row_vec</th>\n",
" <th>matrix_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[1.0, 2.0, 3.0]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[2.0, 1.0, 2.0]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[3.0, 2.0, 1.0]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>[1.0, 2.0, 3.0, 4.0, 5.0]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>[2.0, 5.0, 2.0, 4.0, 1.0]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>[5.0, 4.0, 3.0, 2.0, 1.0]</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [1.0, 2.0, 3.0], 1),\n",
" (2, [2.0, 1.0, 2.0], 1),\n",
" (3, [3.0, 2.0, 1.0], 1),\n",
" (4, [1.0, 2.0, 3.0, 4.0, 5.0], 2),\n",
" (5, [2.0, 5.0, 2.0, 4.0, 1.0], 2),\n",
" (6, [5.0, 4.0, 3.0, 2.0, 1.0], 2)]"
]
},
"execution_count": 287,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS mat_group;\n",
"\n",
"CREATE TABLE mat_group (\n",
" id integer,\n",
" row_vec double precision[],\n",
" matrix_id integer\n",
");\n",
"\n",
"INSERT INTO mat_group VALUES\n",
"(1, '{1,2,3}', 1),\n",
"(2, '{2,1,2}', 1),\n",
"(3, '{3,2,1}', 1),\n",
"(4, '{1,2,3,4,5}' ,2),\n",
"(5, '{2,5,2,4,1}' ,2),\n",
"(6, '{5,4,3,2,1}' ,2);\n",
"\n",
"SELECT * FROM mat_group ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.0 Run the PCA function with grouping for a specified proportion of variance"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>principal_components</th>\n",
" <th>std_dev</th>\n",
" <th>proportion</th>\n",
" <th>matrix_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[0.707106781186548, 0.0, -0.707106781186547]</td>\n",
" <td>1.41421356237</td>\n",
" <td>0.857142857142</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[-0.555378486712784, -0.388303582074091, 0.0442457354870796, 0.255566375612852, 0.688115693174023]</td>\n",
" <td>3.23152203117</td>\n",
" <td>0.764102534485</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[0.587384101786277, -0.485138064894743, 0.311532046315153, -0.449458074050715, 0.347212037159181]</td>\n",
" <td>1.79553112719</td>\n",
" <td>0.235897465516</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [0.707106781186548, 0.0, -0.707106781186547], 1.4142135623731, 0.857142857142245, 1),\n",
" (1, [-0.555378486712784, -0.388303582074091, 0.0442457354870796, 0.255566375612852, 0.688115693174023], 3.2315220311722, 0.764102534485173, 2),\n",
" (2, [0.587384101786277, -0.485138064894743, 0.311532046315153, -0.449458074050715, 0.347212037159181], 1.795531127192, 0.235897465516047, 2)]"
]
},
"execution_count": 289,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS result_table_group, result_table_group_mean;\n",
"\n",
"SELECT madlib.pca_train('mat_group', -- Source table\n",
" 'result_table_group', -- Output table\n",
" 'id', -- Row id of source table\n",
" 0.8, -- Proportion of variance\n",
" 'matrix_id'); -- Grouping column\n",
"\n",
"SELECT * FROM result_table_group ORDER BY matrix_id, row_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Sparse matrices\n",
"Create sample data in sparse matrix form: "
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"8 rows affected.\n",
"8 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>col_id</th>\n",
" <th>value</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>7.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 1, 1.0),\n",
" (1, 5, 5.0),\n",
" (2, 2, 2.0),\n",
" (2, 4, 6.0),\n",
" (3, 2, 7.0),\n",
" (3, 3, 3.0),\n",
" (4, 3, 8.0),\n",
" (4, 4, 4.0)]"
]
},
"execution_count": 266,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS mat_sparse;\n",
"\n",
"CREATE TABLE mat_sparse (\n",
" row_id integer,\n",
" col_id integer,\n",
" value double precision\n",
");\n",
"\n",
"INSERT INTO mat_sparse VALUES\n",
"(1, 1, 1.0),\n",
"(2, 2, 2.0),\n",
"(3, 3, 3.0),\n",
"(4, 4, 4.0),\n",
"(1, 5, 5.0),\n",
"(2, 4, 6.0),\n",
"(3, 2, 7.0),\n",
"(4, 3, 8.0);\n",
"\n",
"SELECT * FROM mat_sparse ORDER BY row_id, col_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is what the sparse matrix looks like when put in dense form: "
]
},
{
"cell_type": "code",
"execution_count": 290,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"4 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>value</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[1.0, 0.0, 0.0, 0.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[0.0, 2.0, 0.0, 6.0, 0.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[0.0, 7.0, 3.0, 0.0, 0.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>[0.0, 0.0, 8.0, 4.0, 0.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [1.0, 0.0, 0.0, 0.0, 5.0]),\n",
" (2, [0.0, 2.0, 0.0, 6.0, 0.0]),\n",
" (3, [0.0, 7.0, 3.0, 0.0, 0.0]),\n",
" (4, [0.0, 0.0, 8.0, 4.0, 0.0])]"
]
},
"execution_count": 290,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS mat_dense;\n",
"\n",
"SELECT madlib.matrix_densify('mat_sparse', \n",
" 'row=row_id, col=col_id, val=value', \n",
" 'mat_dense');\n",
"\n",
"SELECT * FROM mat_dense ORDER BY row_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 7. Run the PCA sparse function for a specified number of principal components"
]
},
{
"cell_type": "code",
"execution_count": 291,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>principal_components</th>\n",
" <th>std_dev</th>\n",
" <th>proportion</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[-0.0876046030186158, -0.0968983772909994, 0.798223884258747, 0.392314423461549, -0.438023015093079]</td>\n",
" <td>4.2136280383</td>\n",
" <td>0.436590030617</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[-0.0647272661608605, 0.877639526308692, 0.0872109579144954, -0.33647142089509, -0.323636330804303]</td>\n",
" <td>3.68408023747</td>\n",
" <td>0.333748701545</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[-0.0780380267884855, 0.177956517174911, -0.550872495873674, 0.711711105037595, -0.390190133942428]</td>\n",
" <td>3.0560690806</td>\n",
" <td>0.229661267838</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [-0.0876046030186158, -0.0968983772909994, 0.798223884258747, 0.392314423461549, -0.438023015093079], 4.21362803829554, 0.436590030617467),\n",
" (2, [-0.0647272661608605, 0.877639526308692, 0.0872109579144954, -0.33647142089509, -0.323636330804303], 3.68408023747461, 0.333748701544697),\n",
" (3, [-0.0780380267884855, 0.177956517174911, -0.550872495873674, 0.711711105037595, -0.390190133942428], 3.05606908060098, 0.229661267837836)]"
]
},
"execution_count": 291,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS result_table, result_table_mean;\n",
"\n",
"SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table\n",
" 'result_table', -- Output table\n",
" 'row_id', -- Row id of source table\n",
" 'col_id', -- Column id of source table\n",
" 'value', -- Value of matrix at row_id, col_id\n",
" 4, -- Actual number of rows in the matrix\n",
" 5, -- Actual number of columns in the matrix\n",
" 3); -- Number of principle components\n",
" \n",
"SELECT * FROM result_table ORDER BY row_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 8. Grouping with sparse matrices\n",
"Create sample data in sparse matrix form:"
]
},
{
"cell_type": "code",
"execution_count": 294,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"8 rows affected.\n",
"8 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>col_id</th>\n",
" <th>value</th>\n",
" <th>matrix_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>5.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>6.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>7.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>8.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 1, 1.0, 1),\n",
" (1, 5, 5.0, 1),\n",
" (2, 2, 2.0, 1),\n",
" (3, 3, 3.0, 1),\n",
" (4, 4, 4.0, 1),\n",
" (2, 4, 6.0, 2),\n",
" (3, 2, 7.0, 2),\n",
" (4, 3, 8.0, 2)]"
]
},
"execution_count": 294,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS mat_sparse_group;\n",
"\n",
"CREATE TABLE mat_sparse_group (\n",
" row_id integer,\n",
" col_id integer,\n",
" value double precision,\n",
" matrix_id integer);\n",
"\n",
"INSERT INTO mat_sparse_group VALUES\n",
"(1, 1, 1.0, 1),\n",
"(2, 2, 2.0, 1),\n",
"(3, 3, 3.0, 1),\n",
"(4, 4, 4.0, 1),\n",
"(1, 5, 5.0, 1),\n",
"(2, 4, 6.0, 2),\n",
"(3, 2, 7.0, 2),\n",
"(4, 3, 8.0, 2);\n",
"\n",
"SELECT * FROM mat_sparse_group ORDER BY matrix_id, row_id, col_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the PCA function with grouping for a specified proportion of variance"
]
},
{
"cell_type": "code",
"execution_count": 295,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"4 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th>principal_components</th>\n",
" <th>std_dev</th>\n",
" <th>proportion</th>\n",
" <th>matrix_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[-0.17805696611353, 0.0681313257646983, 0.140140679619953, 0.389110973431454, -0.890284830567652]</td>\n",
" <td>2.73659933166</td>\n",
" <td>0.544652792875</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[-0.0492086814863993, 0.149371585357526, 0.523758898591052, -0.800253642890858, -0.246043407431996]</td>\n",
" <td>2.06058314533</td>\n",
" <td>0.308800210824</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[0.0, -0.479486114660443, 0.860935716335886, -0.169949281213684, 0.0]</td>\n",
" <td>4.40325305088</td>\n",
" <td>0.520500333693</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[0.0, 0.689230898585949, 0.249586247854403, -0.680196643123417, 0.0]</td>\n",
" <td>3.74355664586</td>\n",
" <td>0.376220573443</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [-0.17805696611353, 0.0681313257646983, 0.140140679619953, 0.389110973431454, -0.890284830567652], 2.73659933165925, 0.544652792875481, 1),\n",
" (2, [-0.0492086814863993, 0.149371585357526, 0.523758898591052, -0.800253642890858, -0.246043407431996], 2.06058314533194, 0.308800210823714, 1),\n",
" (1, [0.0, -0.479486114660443, 0.860935716335886, -0.169949281213684, 0.0], 4.40325305087975, 0.520500333693473, 2),\n",
" (2, [0.0, 0.689230898585949, 0.249586247854403, -0.680196643123417, 0.0], 3.7435566458567, 0.376220573442628, 2)]"
]
},
"execution_count": 295,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS result_table_group, result_table_group_mean;\n",
"\n",
"SELECT madlib.pca_sparse_train( 'mat_sparse_group', -- Source table\n",
" 'result_table_group', -- Output table\n",
" 'row_id', -- Row id of source table\n",
" 'col_id', -- Column id of source table\n",
" 'value', -- Value of matrix at row_id, col_id\n",
" 4, -- Actual number of rows in the matrix\n",
" 5, -- Actual number of columns in the matrix\n",
" 0.8, -- Proportion of variance\n",
" 'matrix_id');\n",
" \n",
"SELECT * FROM result_table_group ORDER BY matrix_id, row_id;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}