{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from scipy.linalg import svd\n", "from sklearn.datasets import load_iris\n", "from sklearn.decomposition import PCA as skPCA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Implementation 1\n", "- based on svd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class PCA():\n", " def __init__(self, n_components=2):\n", " self.n_components = n_components\n", "\n", " def fit(self, X):\n", " U, S, V = svd(X, full_matrices=False)\n", " self.components_ = V[:self.n_components]\n", " self.explained_variance_ratio_ = np.square(S[:self.n_components]) / np.sum(np.square(S))\n", " return self\n", "\n", " def transform(self, X):\n", " return np.dot(X, self.components_.T)\n", "\n", " def inverse_transform(self, X):\n", " return np.dot(X, self.components_)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "X, _ = load_iris(return_X_y=True)\n", "X -= np.mean(X, axis=0)\n", "pca1 = PCA().fit(X)\n", "Xt1 = pca1.transform(X)\n", "Xinv1 = pca1.inverse_transform(Xt1)\n", "pca2 = skPCA(n_components=2).fit(X)\n", "Xt2 = pca2.transform(X)\n", "Xinv2 = pca2.inverse_transform(Xt2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "for i in range(pca1.components_.shape[0]):\n", " assert np.allclose(pca1.components_[i], pca2.components_[i]) or np.allclose(pca1.components_[i], -pca2.components_[i])\n", "assert np.allclose(pca1.explained_variance_ratio_, pca2.explained_variance_ratio_)\n", "for i in range(Xt1.shape[1]):\n", " assert np.allclose(Xt1[:, i], Xt2[:, i]) or np.allclose(Xt1[:, i], -Xt2[:, i])\n", "assert np.allclose(Xinv1, Xinv2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Implementation 2\n", "- based on svd\n", "- center the dataset before svd\n", "- similar to scikit-learn" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "class PCA():\n", " def __init__(self, n_components=2):\n", " self.n_components = n_components\n", "\n", " def fit(self, X):\n", " self.mean_ = np.mean(X, axis=0)\n", " X_train = X - self.mean_\n", " U, S, V = svd(X_train, full_matrices=False)\n", " self.components_ = V[:self.n_components]\n", " self.explained_variance_ratio_ = np.square(S[:self.n_components]) / np.sum(np.square(S))\n", " return self\n", "\n", " def transform(self, X):\n", " X_train = X - self.mean_\n", " return np.dot(X_train, self.components_.T)\n", "\n", " def inverse_transform(self, X):\n", " return np.dot(X, self.components_) + self.mean_" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "X, _ = load_iris(return_X_y=True)\n", "pca1 = PCA().fit(X)\n", "Xt1 = pca1.transform(X)\n", "Xinv1 = pca1.inverse_transform(Xt1)\n", "pca2 = skPCA(n_components=2).fit(X)\n", "Xt2 = pca2.transform(X)\n", "Xinv2 = pca2.inverse_transform(Xt2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "for i in range(pca1.components_.shape[0]):\n", " assert np.allclose(pca1.components_[i], pca2.components_[i]) or np.allclose(pca1.components_[i], -pca2.components_[i])\n", "assert np.allclose(pca1.explained_variance_ratio_, pca2.explained_variance_ratio_)\n", "for i in range(Xt1.shape[1]):\n", " assert np.allclose(Xt1[:, i], Xt2[:, i]) or np.allclose(Xt1[:, i], -Xt2[:, i])\n", "assert np.allclose(Xinv1, Xinv2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Implementation 3\n", "- based on covariance matrix" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "class PCA():\n", " def __init__(self, n_components=2):\n", " self.n_components = n_components\n", "\n", " def fit(self, X):\n", " covmat = np.cov(X, rowvar=False)\n", " eigval, eigvec = np.linalg.eig(covmat)\n", " idx = eigval.argsort()[::-1]\n", " self.components_ = eigvec[idx[:self.n_components]]\n", " self.explained_variance_ratio_ = np.square(eigval[idx[:self.n_components]]) / np.sum(np.square(eigval))\n", " return self\n", "\n", " def transform(self, X):\n", " return np.dot(X, self.components_.T)\n", "\n", " def inverse_transform(self, X):\n", " return np.dot(X, self.components_)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "X, _ = load_iris(return_X_y=True)\n", "X -= np.mean(X, axis=0)\n", "pca1 = PCA().fit(X)\n", "Xt1 = pca1.transform(X)\n", "Xinv1 = pca1.inverse_transform(Xt1)\n", "pca2 = skPCA(n_components=2).fit(X)\n", "Xt2 = pca2.transform(X)\n", "Xinv2 = pca2.inverse_transform(Xt2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }