{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Author:马肖\n", "#### E-Mail:maxiaoscut@aliyun.com\n", "#### GitHub:https://github.com/Albertsr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. 通过SVD自定义实现PCA" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from numpy import linalg as LA\n", "\n", "class PCA_SVD:\n", " # 参数n_components为保留的主成分数\n", " def __init__(self, matrix, n_components=None):\n", " self.matrix = matrix\n", " self.n_components = matrix.shape[1] if n_components==None else n_components\n", " \n", " # 自定义标准化方法\n", " def scale(self):\n", " def scale_vector(vector):\n", " delta = vector - np.mean(vector)\n", " std = np.std(vector, ddof=0)\n", " return delta / std\n", " matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0)\n", " return matrix_scaled\n", " \n", " # 对标准化后的矩阵进行奇异值分解 \n", " def matrix_svd(self):\n", " # 令A为m*n型矩阵,则U、V分别为m阶、n阶正交矩阵\n", " # U的每一个列向量都是A*A.T的特征向量,也称为左奇异向量\n", " # V的每一个行向量都是A.T*A的特征向量,也称为右奇异向量\n", " # sigma是由k个降序排列的奇异值构成的向量,其中k = min(matrix.shape)\n", " U, sigma, V = LA.svd(self.matrix) \n", " \n", " # 非零奇异值的个数不会超过原矩阵的秩,从而不会超过矩阵维度的最小值\n", " assert len(sigma) == min(self.matrix.shape)\n", " return U, sigma, V \n", " \n", " # 通过矩阵V进行PCA,返回最终降维后的矩阵\n", " def pca_result(self):\n", " sigma, V = self.matrix_svd()[1], self.matrix_svd()[2]\n", " \n", " # 奇异值的平方等于(A^T)*A的特征值\n", " eigen_values = np.square(sigma[:self.n_components]) / (self.matrix.shape[0]-1)\n", " \n", " # Q为投影矩阵,由V的前n_components个行向量转置后得到\n", " Q = V[:self.n_components, :].T\n", " \n", " # 计算标准化后的矩阵在Q上的投影,得到PCA的结果\n", " matrix_pca = np.dot(self.scale(), Q)\n", " # matrix_pca的列数应等于保留的主成分数\n", " assert matrix_pca.shape[1] == self.n_components\n", " return matrix_pca, eigen_values, Q.T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. 调用sklearn实现的PCA" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.datasets import load_wine\n", "\n", "X = load_wine().data\n", "row, col = X.shape\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 验证结果表明:sklearn通过矩阵的奇异值分解实现PCA,而不是矩阵的特征分解" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def verify(n_components, dataset = X_scaled):\n", " # 返回sklearn的PCA结果\n", " pca_sklearn = PCA(n_components=n_components)\n", " sklearn_matrix = pca_sklearn.fit_transform(dataset)\n", " sklearn_eigenvalue = pca_sklearn.explained_variance_\n", " sklearn_eigenvector = pca_sklearn.components_\n", " \n", " # 返回SVD的PCA结果\n", " pca_custom = PCA_SVD(dataset, n_components=n_components)\n", " pca_custom_matrix, pca_custom_eigenvalue, pca_custom_eigenvector = pca_custom.pca_result()\n", " \n", " # 验证\n", " verify_eigenvalue = np.allclose(abs(sklearn_eigenvalue), abs(pca_custom_eigenvalue))\n", " verify_eigenvector = np.allclose(abs(sklearn_eigenvector), abs(pca_custom_eigenvector))\n", " verify_result = np.allclose(abs(sklearn_matrix), abs(pca_custom_matrix)) \n", " \n", " verify_bool = all([verify_eigenvalue, verify_eigenvector, verify_result])\n", " return verify_bool" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all(map(verify, range(1, col+1)))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }