{
 "metadata": {
  "name": "PCA Test"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# PCA Test\n",
      "\n",
      "Generates the figures for the PCA and k-means slides: pairwise scatter plots of the iris data, its 2-D PCA projection and back-projection, k-means cluster centres, and a step-by-step PCA illustration on a synthetic 2-D point cloud.\n",
      "\n",
      "All figures are written as PDFs below `presentation/pca-pics/` and `presentation/kmeans-pics/`; those directories must already exist."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Explicit imports so the notebook runs on a fresh kernel without relying on\n",
      "# IPython's implicit pylab namespace (plt, np, svd, pi, gca, ...).\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "from matplotlib.patches import Ellipse\n",
      "from sklearn import datasets\n",
      "from sklearn.cluster import KMeans\n",
      "from sklearn.decomposition import PCA"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "SEED = 0               # fixed seed: k-means and the synthetic cloud become reproducible\n",
      "PAIR_FIGSIZE = (4, 4)  # figure size of the pairwise scatter grids\n",
      "PAIR_MARKER_SIZE = 3   # marker size inside the pairwise scatter grids\n",
      "\n",
      "plt.prism()            # make 'prism' the default colormap for c=... scatters\n",
      "\n",
      "# Read the Bunch fields by name; unpacking .values() depends on key order and count.\n",
      "iris = datasets.load_iris()\n",
      "data, target = iris.data, iris.target"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def plot_pair_grid(X, colors=None, title=None, filename=None):\n",
      "    \"\"\"Scatter every column of X against every column on a square grid.\n",
      "\n",
      "    X        -- 2-D array, one sample per row and one feature per column.\n",
      "    colors   -- optional per-sample values forwarded to scatter as c=...\n",
      "    title    -- optional figure-level title (suptitle).\n",
      "    filename -- optional path the finished figure is saved to.\n",
      "    \"\"\"\n",
      "    n_features = X.shape[1]\n",
      "    scatter_kwargs = {'alpha': .6, 's': PAIR_MARKER_SIZE}\n",
      "    if colors is not None:\n",
      "        scatter_kwargs['c'] = colors\n",
      "    plt.figure(figsize=PAIR_FIGSIZE)\n",
      "    for i in range(n_features):\n",
      "        for j in range(n_features):\n",
      "            plt.subplot(n_features, n_features, i * n_features + j + 1)\n",
      "            plt.scatter(X[:, i], X[:, j], **scatter_kwargs)\n",
      "            plt.xticks(())\n",
      "            plt.yticks(())\n",
      "    if title is not None:\n",
      "        plt.suptitle(title)\n",
      "    if filename is not None:\n",
      "        plt.savefig(filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Iris: all feature pairs"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plot_pair_grid(data, title=\"$X$\",\n",
      "               filename=\"presentation/pca-pics/iris-all-nocolor.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plot_pair_grid(data, colors=target, title=\"$X$, $Y$\",\n",
      "               filename=\"presentation/pca-pics/iris-all.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Iris: projection onto two principal components"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "iris_pca = PCA(n_components=2)\n",
      "data_2d = iris_pca.fit_transform(data)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure(figsize=(3,3))\n",
      "plt.scatter(data_2d[:,0], data_2d[:,1])\n",
      "plt.title(\"$X_{PCA}$\")\n",
      "plt.savefig(\"presentation/pca-pics/iris-2d-nocolor.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure()  # explicit new figure so this plot never lands on the previous one\n",
      "plt.scatter(data_2d[:,0], data_2d[:,1], c=target)\n",
      "plt.savefig(\"presentation/pca-pics/iris-2d.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(iris_pca.components_)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Iris: back-projection from the 2-D representation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Map the 2-D scores back into the original 4-D feature space.\n",
      "data_back = iris_pca.inverse_transform(data_2d)\n",
      "plot_pair_grid(data_back, colors=target,\n",
      "               filename=\"presentation/pca-pics/iris-bt.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Raw string: '\\m' is not a valid Python escape sequence.\n",
      "plot_pair_grid(data_back, title=r\"$X_{\\mathrm{clean}}$\",\n",
      "               filename=\"presentation/pca-pics/iris-bt-nocolor.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## k-means on the iris data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "kmeans = KMeans(n_clusters=3, random_state=SEED)\n",
      "labels = kmeans.fit(data).labels_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure()\n",
      "plt.scatter(data[:,0], data[:,1], c=labels)\n",
      "# Centre k belongs to label k, so colouring the centres 0..2 matches the points.\n",
      "plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c=[0,1,2], s=100);\n",
      "plt.savefig(\"presentation/kmeans-pics/cluster-centers.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure()\n",
      "plt.scatter(data[:,0], data[:,1])\n",
      "plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=100);\n",
      "plt.savefig(\"presentation/kmeans-pics/cluster-centers-nocolor.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## PCA on a synthetic 2-D point cloud"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def plotEllipse(pos,P,edge,face,line_width):\n",
      "    \"\"\"Add an ellipse derived from the 2x2 matrix P to the current axes.\n",
      "\n",
      "    The ellipse is centred at pos. Its full width and height are twice the\n",
      "    square roots of the singular values of P, and it is rotated by the angle\n",
      "    (in degrees) of the first left singular vector of P.\n",
      "\n",
      "    pos        -- (x, y) centre of the ellipse.\n",
      "    P          -- 2x2 matrix that is decomposed with an SVD.\n",
      "    edge, face -- edge and face colours of the patch.\n",
      "    line_width -- line width of the outline.\n",
      "\n",
      "    Returns the matplotlib Ellipse patch that was added.\n",
      "    \"\"\"\n",
      "    U, singular_values, Vh = np.linalg.svd(P)\n",
      "    orient = np.degrees(np.arctan2(U[1,0], U[0,0]))\n",
      "    ellipsePlot = Ellipse(xy=pos,\n",
      "                          width=2.0*np.sqrt(singular_values[0]),\n",
      "                          height=2.0*np.sqrt(singular_values[1]),\n",
      "                          angle=orient, facecolor=face, edgecolor=edge, lw=line_width)\n",
      "    plt.gca().add_patch(ellipsePlot)\n",
      "    return ellipsePlot"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure(figsize=(3,3))\n",
      "plt.xlim(-4,6)\n",
      "plt.ylim(-4,6)\n",
      "# A covariance matrix must be symmetric; this is the symmetrised form of the\n",
      "# previous [[2,-4],[-1,4]], which multivariate_normal does not accept as valid input.\n",
      "Cov = np.array([[2, -2.5], [-2.5, 4]])\n",
      "print(np.linalg.det(Cov))\n",
      "rng = np.random.RandomState(SEED)  # local generator keeps this cell idempotent\n",
      "X = rng.multivariate_normal([1,2], Cov, size=500)\n",
      "plt.scatter(X[:,0], X[:,1], s=2, alpha=0.7)\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# One figure is built up incrementally and saved after each stage, so all\n",
      "# stages have to stay in a single cell.\n",
      "plt.figure(figsize=(3,3))\n",
      "cloud_pca = PCA()\n",
      "cloud_pca.fit(X)\n",
      "plt.xlim(-4,6)\n",
      "plt.ylim(-4,6)\n",
      "mu = cloud_pca.mean_\n",
      "# Rows are the principal axes, each scaled by its explained variance.\n",
      "axes_var = cloud_pca.explained_variance_[:, np.newaxis] * cloud_pca.components_\n",
      "plotEllipse(mu, axes_var.T, 'k', 'none', 2)\n",
      "plt.scatter(X[:,0], X[:,1], s=2, alpha=.2)\n",
      "plt.scatter([mu[0]],[mu[1]], s=40, c='red')\n",
      "\n",
      "# the point cloud is modeled as an ellipse\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-model.pdf\")\n",
      "\n",
      "# Rows are the principal axes, each scaled by one standard deviation.\n",
      "axes_std = np.sqrt(cloud_pca.explained_variance_)[:, np.newaxis] * cloud_pca.components_\n",
      "axis_arrow_style = dict(fc=\"b\", ec='b', head_width=.5, head_length=.5,\n",
      "                        width=.1, length_includes_head=True)\n",
      "first_axis = plt.arrow(mu[0], mu[1], axes_std[0,0], axes_std[0,1], **axis_arrow_style)\n",
      "\n",
      "# this is the direction with the maximum variance\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-vecs-1a.pdf\")\n",
      "\n",
      "first_axis.set_visible(False)\n",
      "\n",
      "# Line through the mean along the first principal axis.\n",
      "plt.plot(\n",
      "    [mu[0] - 10*axes_std[0,0], mu[0] + 10*axes_std[0,0]],\n",
      "    [mu[1] - 10*axes_std[0,1], mu[1] + 10*axes_std[0,1]])\n",
      "\n",
      "second_component = cloud_pca.components_[1,:]\n",
      "projection_arrows = []\n",
      "for i in range(10):\n",
      "    # Offset that removes the point's component along the second axis.\n",
      "    d = np.dot(mu - X[i,:], second_component) * second_component\n",
      "    projection_arrows.append(\n",
      "        plt.arrow(X[i,0], X[i,1], d[0], d[1], head_width=.1, head_length=.1))\n",
      "\n",
      "# we can project all points to this axis, and only use \"little\" information\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-vecs-proj1.pdf\")\n",
      "\n",
      "# Explicit loop: map() is lazy on Python 3 and would never hide the arrows.\n",
      "for arrow in projection_arrows:\n",
      "    arrow.set_visible(False)\n",
      "plt.arrow(mu[0], mu[1], axes_std[1,0], axes_std[1,1], **axis_arrow_style);\n",
      "\n",
      "# the next axis has less variance, and even less error\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-vecs-2a.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## PCA step by step"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "STEP_FIGSIZE = (3,3)\n",
      "STEP_MARKER_SIZE = 2\n",
      "\n",
      "# Reuses cloud_pca, mu and axes_var from the point-cloud model cell above.\n",
      "X_centered = X - mu\n",
      "X_rot = cloud_pca.transform(X)        # the cloud expressed in the principal-axis basis\n",
      "rot_cov = np.cov(X_rot, rowvar=False)  # sample covariance of the rotated cloud\n",
      "\n",
      "def start_centered_figure():\n",
      "    \"\"\"Open a new step figure whose limits are the (-4, 6) window shifted by -mu.\"\"\"\n",
      "    plt.figure(figsize=STEP_FIGSIZE)\n",
      "    plt.xlim(-4-mu[0], 6-mu[0])\n",
      "    plt.ylim(-4-mu[1], 6-mu[1])\n",
      "\n",
      "def plot_projection_step(drop_axis, filename):\n",
      "    \"\"\"Plot the rotated cloud and, for the first 10 points, an arrow that\n",
      "    removes the point's component along drop_axis; save the figure to filename.\n",
      "    \"\"\"\n",
      "    drop_axis = np.asarray(drop_axis)\n",
      "    start_centered_figure()\n",
      "    plt.scatter(X_rot[:,0], X_rot[:,1], s=STEP_MARKER_SIZE, alpha=.2)\n",
      "    plotEllipse([0,0], rot_cov, 'k', 'none', 2)\n",
      "    plt.scatter([0],[0], s=40, c='red')\n",
      "    for i in range(10):\n",
      "        d = np.dot(-X_rot[i,:], drop_axis) * drop_axis\n",
      "        plt.arrow(X_rot[i,0], X_rot[i,1], d[0], d[1], head_width=.1, head_length=.1)\n",
      "    plt.savefig(filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Step 1: the raw cloud with its mean marked.\n",
      "plt.figure(figsize=STEP_FIGSIZE)\n",
      "plt.xlim(-4,6)\n",
      "plt.ylim(-4,6)\n",
      "plt.scatter(X[:,0], X[:,1], s=STEP_MARKER_SIZE, alpha=.5)\n",
      "plt.scatter([mu[0]],[mu[1]], s=40, c='red')\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-step1.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Step 2: subtract the mean so the cloud is centred on the origin.\n",
      "start_centered_figure()\n",
      "plt.scatter(X_centered[:,0], X_centered[:,1], s=STEP_MARKER_SIZE, alpha=.5)\n",
      "plt.scatter([0],[0], s=40, c='red')\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-step2.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Step 3: overlay the ellipse model on the centred cloud.\n",
      "start_centered_figure()\n",
      "plt.scatter(X_centered[:,0], X_centered[:,1], s=STEP_MARKER_SIZE, alpha=.5)\n",
      "plotEllipse([0,0], axes_var.T, 'k', 'none', 2)\n",
      "plt.scatter([0],[0], s=40, c='red')\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-step3.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Step 4: the cloud in the principal-axis basis; the previous ellipse is kept\n",
      "# in gray next to the ellipse of the rotated data.\n",
      "start_centered_figure()\n",
      "plt.scatter(X_rot[:,0], X_rot[:,1], s=4, alpha=.5)\n",
      "plotEllipse([0,0], axes_var.T, 'gray', 'none', 2)\n",
      "plotEllipse([0,0], rot_cov, 'k', 'none', 2)\n",
      "plt.scatter([0],[0], s=40, c='red')\n",
      "plt.savefig(\"presentation/pca-pics/pointcloud-2d-step4.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Step 5 drops the second coordinate, step 6 drops the first.\n",
      "plot_projection_step([0,1], \"presentation/pca-pics/pointcloud-2d-step5.pdf\")\n",
      "plot_projection_step([1,0], \"presentation/pca-pics/pointcloud-2d-step6.pdf\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}