{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Author：马肖\n",
    "#### E-Mail：maxiaoscut@aliyun.com\n",
    "#### GitHub：https://github.com/Albertsr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 通过SVD自定义实现PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from numpy import linalg as LA\n",
    "\n",
    "class PCA_SVD:\n",
    "    # 参数n_components为保留的主成分数\n",
    "    def __init__(self, matrix, n_components=None):\n",
    "        self.matrix = matrix\n",
    "        self.n_components = matrix.shape[1] if n_components==None else n_components\n",
    "    \n",
    "    # 自定义标准化方法\n",
    "    def scale(self):\n",
    "        def scale_vector(vector):\n",
    "            delta = vector - np.mean(vector)\n",
    "            std = np.std(vector, ddof=0)\n",
    "            return delta / std\n",
    "        matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0)\n",
    "        return matrix_scaled\n",
    "     \n",
    "    # 对标准化后的矩阵进行奇异值分解    \n",
    "    def matrix_svd(self):\n",
    "        # 令A为m*n型矩阵，则U、V分别为m阶、n阶正交矩阵\n",
    "        # U的每一个列向量都是A*A.T的特征向量，也称为左奇异向量\n",
    "        # V的每一个行向量都是A.T*A的特征向量，也称为右奇异向量\n",
    "        # sigma是由k个降序排列的奇异值构成的向量，其中k = min(matrix.shape)\n",
    "        U, sigma, V =  LA.svd(self.matrix) \n",
    "        \n",
    "        # 非零奇异值的个数不会超过原矩阵的秩，从而不会超过矩阵维度的最小值\n",
    "        assert len(sigma) == min(self.matrix.shape)\n",
    "        return U, sigma, V \n",
    "    \n",
    "    # 通过矩阵V进行PCA，返回最终降维后的矩阵\n",
    "    def pca_result(self):\n",
    "        sigma, V = self.matrix_svd()[1], self.matrix_svd()[2]\n",
    "        \n",
    "        # 奇异值的平方等于(A^T)*A的特征值\n",
    "        eigen_values = np.square(sigma[:self.n_components]) / (self.matrix.shape[0]-1)\n",
    "        \n",
    "        # Q为投影矩阵，由V的前n_components个行向量转置后得到\n",
    "        Q = V[:self.n_components, :].T\n",
    "        \n",
    "        # 计算标准化后的矩阵在Q上的投影，得到PCA的结果\n",
    "        matrix_pca = np.dot(self.scale(), Q)\n",
    "        # matrix_pca的列数应等于保留的主成分数\n",
    "        assert matrix_pca.shape[1] == self.n_components\n",
    "        return matrix_pca, eigen_values, Q.T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 调用sklearn实现的PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.datasets import load_wine\n",
    "\n",
    "X = load_wine().data\n",
    "row, col = X.shape\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 验证结果表明：sklearn通过矩阵的奇异值分解实现PCA，而不是矩阵的特征分解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def verify(n_components, dataset = X_scaled):\n",
    "    # 返回sklearn的PCA结果\n",
    "    pca_sklearn = PCA(n_components=n_components)\n",
    "    sklearn_matrix = pca_sklearn.fit_transform(dataset)\n",
    "    sklearn_eigenvalue = pca_sklearn.explained_variance_\n",
    "    sklearn_eigenvector = pca_sklearn.components_\n",
    "    \n",
    "    # 返回SVD的PCA结果\n",
    "    pca_custom = PCA_SVD(dataset, n_components=n_components)\n",
    "    pca_custom_matrix, pca_custom_eigenvalue, pca_custom_eigenvector = pca_custom.pca_result()\n",
    "    \n",
    "    # 验证\n",
    "    verify_eigenvalue = np.allclose(abs(sklearn_eigenvalue), abs(pca_custom_eigenvalue))\n",
    "    verify_eigenvector = np.allclose(abs(sklearn_eigenvector), abs(pca_custom_eigenvector))\n",
    "    verify_result = np.allclose(abs(sklearn_matrix), abs(pca_custom_matrix))  \n",
    "    \n",
    "    verify_bool = all([verify_eigenvalue, verify_eigenvector, verify_result])\n",
    "    return verify_bool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all(map(verify, range(1, col+1)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}