community-artifacts/Unsupervised-learning/PCA-project-v1.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Principle component analysis projection (MADlib v1.10+)\n", "Demonstrates PCA projection including grouping, which was introduced in MADlib v1.10" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", " \"You should import from traitlets.config instead.\", ShimWarning)\n", "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" ] } ], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "u'Connected: gpdbchina@madlib'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n", "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", "#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>version</th>\n", " </tr>\n", " <tr>\n", " <td>MADlib version: 1.10.0-dev, git revision: rc/v1.9alpha-rc1-91-ga0325d6, cmake configuration time: Wed Jan 11 01:36:53 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(u'MADlib version: 1.10.0-dev, git revision: rc/v1.9alpha-rc1-91-ga0325d6, cmake configuration time: Wed Jan 11 01:36:53 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select madlib.version();\n", "#%sql select version();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Create sample data in dense matrix form" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "3 rows affected.\n", "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>row_vec</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[1.0, 2.0, 3.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[2.0, 1.0, 2.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[3.0, 2.0, 1.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [1.0, 2.0, 3.0]), (2, [2.0, 1.0, 2.0]), (3, [3.0, 2.0, 1.0])]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "DROP TABLE IF EXISTS mat;\n", "\n", "CREATE TABLE mat (\n", " id integer,\n", " row_vec double precision[]\n", ");\n", "\n", "INSERT INTO mat VALUES\n", "(1, '{1,2,3}'),\n", "(2, '{2,1,2}'),\n", "(3, '{3,2,1}');\n", "\n", "SELECT * FROM mat ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Run the PCA function for a specified number of principal components" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "2 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>principal_components</th>\n", " <th>std_dev</th>\n", " <th>proportion</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[0.707106781186547, -4.16333634234434e-17, -0.707106781186547]</td>\n", " <td>1.41421356237</td>\n", " <td>0.857142857142</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[-2.77555756156289e-17, -1.0, -4.16333634234434e-17]</td>\n", " <td>0.57735026919</td>\n", " <td>0.142857142857</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [0.707106781186547, -4.16333634234434e-17, -0.707106781186547], 1.41421356237309, 0.857142857142244),\n", " (2, [-2.77555756156289e-17, -1.0, -4.16333634234434e-17], 0.577350269189626, 0.142857142857041)]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS result_table, result_table_mean;\n", "\n", "SELECT madlib.pca_train('mat', -- Source table\n", " 'result_table', -- Output table\n", " 'id', -- Row id of source table\n", " 2); -- Number of principle components\n", "\n", "SELECT * FROM result_table ORDER BY row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Projection in dense matrix form\n", "Project the original data to a lower dimensional representation, and view the result of the projection:" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>row_vec</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[-1.41421356237309, -0.33333333333]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[2.77555756157677e-17, 0.66666666667]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[1.41421356237309, -0.33333333333]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [-1.41421356237309, -0.33333333333]),\n", " (2, [2.77555756157677e-17, 0.66666666667]),\n", " (3, [1.41421356237309, -0.33333333333])]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS residual_table, result_summary_table, out_table;\n", "SELECT madlib.pca_project( 'mat',\n", " 'result_table',\n", " 'out_table',\n", " 'id',\n", " 'residual_table',\n", " 'result_summary_table'\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the error in the projection:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>exec_time</th>\n", " <th>residual_norm</th>\n", " <th>relative_residual_norm</th>\n", " </tr>\n", " <tr>\n", " <td>344.725847244</td>\n", " <td>3.92699236582</td>\n", " <td>0.645593568096</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(344.725847244, 3.92699236582, 0.645593568096)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM result_summary_table;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the residuals:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>row_vec</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[0.0253387670918417, 0.346033388731942, -1.18096378457015]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.0397264537778188, 0.542515718038531, -1.85153062226574]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[0.0656887038551082, 0.897063567221092, -3.06155307505916]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [0.0253387670918417, 0.346033388731942, -1.18096378457015]),\n", " (2, [0.0397264537778188, 0.542515718038531, -1.85153062226574]),\n", " (3, [0.0656887038551082, 0.897063567221092, -3.06155307505916])]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT * FROM residual_table ORDER BY row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Grouping with dense matrices\n", "Create sample data in dense matrix form with a grouping column:" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "6 rows affected.\n", "6 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>id</th>\n", " <th>row_vec</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[1.0, 2.0, 3.0]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[2.0, 1.0, 2.0]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[3.0, 2.0, 1.0]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[1.0, 2.0, 3.0, 4.0, 5.0]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>[2.0, 5.0, 2.0, 4.0, 1.0]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>[5.0, 4.0, 3.0, 2.0, 1.0]</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [1.0, 2.0, 3.0], 1),\n", " (2, [2.0, 1.0, 2.0], 1),\n", " (3, [3.0, 2.0, 1.0], 1),\n", " (4, [1.0, 2.0, 3.0, 4.0, 5.0], 2),\n", " (5, [2.0, 5.0, 2.0, 4.0, 1.0], 2),\n", " (6, [5.0, 4.0, 3.0, 2.0, 1.0], 2)]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "DROP TABLE IF EXISTS mat_group;\n", "\n", "CREATE TABLE mat_group (\n", " id integer,\n", " row_vec double precision[],\n", " matrix_id integer\n", ");\n", "\n", "INSERT INTO mat_group VALUES\n", "(1, '{1,2,3}', 1),\n", "(2, '{2,1,2}', 1),\n", "(3, '{3,2,1}', 1),\n", "(4, '{1,2,3,4,5}' ,2),\n", "(5, '{2,5,2,4,1}' ,2),\n", "(6, '{5,4,3,2,1}' ,2);\n", "\n", "SELECT * FROM mat_group ORDER BY id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5. Run the PCA function with grouping for a specified proportion of variance" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>principal_components</th>\n", " <th>std_dev</th>\n", " <th>proportion</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[-0.707106781186548, -1.11022302462516e-16, 0.707106781186548]</td>\n", " <td>1.41421356237</td>\n", " <td>0.857142857142</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[0.555378486712784, 0.388303582074091, -0.0442457354870797, -0.255566375612852, -0.688115693174023]</td>\n", " <td>3.23152203117</td>\n", " <td>0.764102534485</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.587384101786277, -0.485138064894743, 0.311532046315153, -0.449458074050715, 0.347212037159181]</td>\n", " <td>1.79553112719</td>\n", " <td>0.235897465516</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [-0.707106781186548, -1.11022302462516e-16, 0.707106781186548], 1.41421356237309, 0.857142857142245, 1),\n", " (1, [0.555378486712784, 0.388303582074091, -0.0442457354870797, -0.255566375612852, -0.688115693174023], 3.2315220311722, 0.764102534485172, 2),\n", " (2, [0.587384101786277, -0.485138064894743, 0.311532046315153, -0.449458074050715, 0.347212037159181], 1.795531127192, 0.235897465516047, 2)]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS result_table_group, result_table_group_mean;\n", "\n", "SELECT madlib.pca_train('mat_group', -- Source table\n", " 'result_table_group', -- Output table\n", " 'id', -- Row id of source table\n", " 0.8, -- Proportion of variance\n", " 'matrix_id'); -- Grouping column\n", "\n", "SELECT * FROM result_table_group ORDER BY matrix_id, row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Projection in dense matrix form with grouping \n", "Run the PCA project function on subsets of an input table based on grouping columns. Note that the parameter 'pc_table' used for projection must also be generated in training using the same grouping columns. " ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "6 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>row_vec</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[1.4142135623731]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[7.40148683087139e-17]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[-1.4142135623731]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[-3.59290479201926, 0.559694003674779]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>5</td>\n", " <td>[0.924092949098971, -2.00871628417505]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>6</td>\n", " <td>[2.66881184290186, 1.44902228049511]</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [1.4142135623731], 1),\n", " (2, [7.40148683087139e-17], 1),\n", " (3, [-1.4142135623731], 1),\n", " (4, [-3.59290479201926, 0.559694003674779], 2),\n", " (5, [0.924092949098971, -2.00871628417505], 2),\n", " (6, [2.66881184290186, 1.44902228049511], 2)]" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "DROP TABLE IF EXISTS mat_group_projected;\n", "SELECT madlib.pca_project('mat_group',\n", " 'result_table_group',\n", " 'mat_group_projected',\n", " 'id');\n", "SELECT * FROM mat_group_projected ORDER BY matrix_id, row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. Sparse matrices\n", "Create sample data in sparse matrix form: " ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "8 rows affected.\n", "8 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>col_id</th>\n", " <th>value</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>6.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>7.0</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>8.0</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>4.0</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 1, 1.0),\n", " (1, 5, 5.0),\n", " (2, 2, 2.0),\n", " (2, 4, 6.0),\n", " (3, 2, 7.0),\n", " (3, 3, 3.0),\n", " (4, 3, 8.0),\n", " (4, 4, 4.0)]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS mat_sparse;\n", "\n", "CREATE TABLE mat_sparse (\n", " row_id integer,\n", " col_id integer,\n", " value double precision\n", ");\n", "\n", "INSERT INTO mat_sparse VALUES\n", "(1, 1, 1.0),\n", "(2, 2, 2.0),\n", "(3, 3, 3.0),\n", "(4, 4, 4.0),\n", "(1, 5, 5.0),\n", "(2, 4, 6.0),\n", "(3, 2, 7.0),\n", "(4, 3, 8.0);\n", "\n", "SELECT * FROM mat_sparse ORDER BY row_id, col_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is what the sparse matrix looks like when put in dense form: " ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "4 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>value</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[1.0, 0.0, 0.0, 0.0, 5.0]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.0, 2.0, 0.0, 6.0, 0.0]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[0.0, 7.0, 3.0, 0.0, 0.0]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[0.0, 0.0, 8.0, 4.0, 0.0]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [1.0, 0.0, 0.0, 0.0, 5.0]),\n", " (2, [0.0, 2.0, 0.0, 6.0, 0.0]),\n", " (3, [0.0, 7.0, 3.0, 0.0, 0.0]),\n", " (4, [0.0, 0.0, 8.0, 4.0, 0.0])]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS mat_dense;\n", "\n", "SELECT madlib.matrix_densify('mat_sparse', \n", " 'row=row_id, col=col_id, val=value', \n", " 'mat_dense');\n", "\n", "SELECT * FROM mat_dense ORDER BY row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. Run the PCA sparse function for a specified number of principal components" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "3 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>principal_components</th>\n", " <th>std_dev</th>\n", " <th>proportion</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[0.0876046030186159, 0.0968983772909993, -0.798223884258748, -0.392314423461549, 0.438023015093079]</td>\n", " <td>4.2136280383</td>\n", " <td>0.436590030617</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[-0.0647272661608605, 0.877639526308691, 0.0872109579144957, -0.336471420895091, -0.323636330804302]</td>\n", " <td>3.68408023747</td>\n", " <td>0.333748701545</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[0.0780380267884856, -0.177956517174911, 0.550872495873674, -0.711711105037595, 0.390190133942428]</td>\n", " <td>3.0560690806</td>\n", " <td>0.229661267838</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [0.0876046030186159, 0.0968983772909993, -0.798223884258748, -0.392314423461549, 0.438023015093079], 4.21362803829554, 0.436590030617467),\n", " (2, [-0.0647272661608605, 0.877639526308691, 0.0872109579144957, -0.336471420895091, -0.323636330804302], 3.68408023747461, 0.333748701544697),\n", " (3, [0.0780380267884856, -0.177956517174911, 0.550872495873674, -0.711711105037595, 0.390190133942428], 3.05606908060098, 0.229661267837835)]" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS result_table, result_table_mean;\n", "\n", "SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table\n", " 'result_table', -- Output table\n", " 'row_id', -- Row id of source table\n", " 'col_id', -- Column id of source table\n", " 'value', -- Value of matrix at row_id, col_id\n", " 4, -- Actual number of rows in the matrix\n", " 5, -- Actual number of columns in the matrix\n", " 3); -- Number of principle components\n", " \n", "SELECT * FROM result_table ORDER BY row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 8. Projection in sparse matrix form\n", "Project the original sparse data to low-dimensional representation:" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "4 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>row_vec</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[4.66617015032369, -2.63552220635847, 2.1865220849604]</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.228360685652383, -1.21616275892926, -4.46864627611561]</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[0.672067460100428, 5.45249627172823, 0.56445525585642]</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[-5.5665982960765, -1.6008113064405, 1.71766893529879]</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [4.66617015032369, -2.63552220635847, 2.1865220849604]),\n", " (2, [0.228360685652383, -1.21616275892926, -4.46864627611561]),\n", " (3, [0.672067460100428, 5.45249627172823, 0.56445525585642]),\n", " (4, [-5.5665982960765, -1.6008113064405, 1.71766893529879])]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS mat_sparse_out;\n", "\n", "SELECT madlib.pca_sparse_project(\n", " 'mat_sparse',\n", " 'result_table',\n", " 'mat_sparse_out',\n", " 'row_id',\n", " 'col_id',\n", " 'value',\n", " 4,\n", " 5\n", " );\n", "\n", "SELECT * FROM mat_sparse_out ORDER BY row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 8. Grouping with sparse matrices\n", "Create sample data in sparse matrix form:" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "8 rows affected.\n", "8 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>col_id</th>\n", " <th>value</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>5.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>2.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>3.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>4.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>6.0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>7.0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>8.0</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, 1, 1.0, 1),\n", " (1, 5, 5.0, 1),\n", " (2, 2, 2.0, 1),\n", " (3, 3, 3.0, 1),\n", " (4, 4, 4.0, 1),\n", " (2, 4, 6.0, 2),\n", " (3, 2, 7.0, 2),\n", " (4, 3, 8.0, 2)]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS mat_sparse_group;\n", "\n", "CREATE TABLE mat_sparse_group (\n", " row_id integer,\n", " col_id integer,\n", " value double precision,\n", " matrix_id integer);\n", "\n", "INSERT INTO mat_sparse_group VALUES\n", "(1, 1, 1.0, 1),\n", "(2, 2, 2.0, 1),\n", "(3, 3, 3.0, 1),\n", "(4, 4, 4.0, 1),\n", "(1, 5, 5.0, 1),\n", "(2, 4, 6.0, 2),\n", "(3, 2, 7.0, 2),\n", "(4, 3, 8.0, 2);\n", "\n", "SELECT * FROM mat_sparse_group ORDER BY matrix_id, row_id, col_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the PCA function with grouping for a specified proportion of variance" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "4 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>principal_components</th>\n", " <th>std_dev</th>\n", " <th>proportion</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[-0.17805696611353, 0.0681313257646982, 0.140140679619953, 0.389110973431455, -0.890284830567652]</td>\n", " <td>2.73659933166</td>\n", " <td>0.544652792875</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[-0.0492086814863994, 0.149371585357526, 0.523758898591052, -0.800253642890858, -0.246043407431997]</td>\n", " <td>2.06058314533</td>\n", " <td>0.308800210824</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[0.0, -0.479486114660443, 0.860935716335885, -0.169949281213684, 0.0]</td>\n", " <td>4.40325305088</td>\n", " <td>0.520500333693</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.0, 0.689230898585949, 0.249586247854404, -0.680196643123418, 0.0]</td>\n", " <td>3.74355664586</td>\n", " <td>0.376220573443</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [-0.17805696611353, 0.0681313257646982, 0.140140679619953, 0.389110973431455, -0.890284830567652], 2.73659933165925, 0.544652792875482, 1),\n", " (2, [-0.0492086814863994, 0.149371585357526, 0.523758898591052, -0.800253642890858, -0.246043407431997], 2.06058314533194, 0.308800210823714, 1),\n", " (1, [0.0, -0.479486114660443, 0.860935716335885, -0.169949281213684, 0.0], 4.40325305087975, 0.520500333693473, 2),\n", " (2, [0.0, 0.689230898585949, 0.249586247854404, -0.680196643123418, 0.0], 3.7435566458567, 0.376220573442628, 2)]" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS result_table_group, result_table_group_mean;\n", "\n", "SELECT madlib.pca_sparse_train( 'mat_sparse_group', -- Source table\n", " 'result_table_group', -- Output table\n", " 'row_id', -- Row id of source table\n", " 'col_id', -- Column id of source table\n", " 'value', -- Value of matrix at row_id, col_id\n", " 4, -- Actual number of rows in the matrix\n", " 5, -- Actual number of columns in the matrix\n", " 0.8, -- Proportion of variance\n", " 'matrix_id');\n", " \n", "SELECT * FROM result_table_group ORDER BY matrix_id, row_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 9. Projection in sparse format with grouping" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "1 rows affected.\n", "8 rows affected.\n" ] }, { "data": { "text/html": [ "<table>\n", " <tr>\n", " <th>row_id</th>\n", " <th>row_vec</th>\n", " <th>matrix_id</th>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[-4.00039298524261, -0.626820612715982]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[0.765350785238575, 0.951348276645455]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[1.04951017256904, 2.22388180170356]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[2.185532027435, -2.54840946563303]</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>[-0.627846810195469, -0.685031603549092]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>[-1.64754249747757, -4.7662114622896]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>[-3.98424961281857, 4.13958468655255]</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>[6.25963892049161, 1.31165837928614]</td>\n", " <td>2</td>\n", " </tr>\n", "</table>" ], "text/plain": [ "[(1, [-4.00039298524261, -0.626820612715982], 1),\n", " (2, [0.765350785238575, 0.951348276645455], 1),\n", " (3, [1.04951017256904, 2.22388180170356], 1),\n", " (4, [2.185532027435, -2.54840946563303], 1),\n", " (1, [-0.627846810195469, -0.685031603549092], 2),\n", " (2, [-1.64754249747757, -4.7662114622896], 2),\n", " (3, [-3.98424961281857, 4.13958468655255], 2),\n", " (4, [6.25963892049161, 1.31165837928614], 2)]" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "DROP TABLE IF EXISTS mat_sparse_group_projected;\n", "SELECT madlib.pca_sparse_project(\n", " 'mat_sparse_group',\n", " 'result_table_group',\n", " 'mat_sparse_group_projected',\n", " 'row_id',\n", " 'col_id',\n", " 'value',\n", " 4,\n", " 5\n", " );\n", "\n", "SELECT * FROM mat_sparse_group_projected ORDER BY matrix_id, row_id;" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }

community-artifacts/Unsupervised-learning/PCA-project-v1.ipynb (1,381 lines of code) (raw):