{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from lets_plot import *\n", "\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", "from sklearn.preprocessing import scale\n", "from sklearn.metrics.pairwise import check_pairwise_arrays\n", "from sklearn.metrics import euclidean_distances\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(noise = 0.1, n_samples = 1000)\n", "X = scale(X)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "data = {'x1':X.T[0], 'x2':X.T[1], 'target':y}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = ggplot() \\\n", " + geom_point(aes(x='x1', y='x2', fill='target', group='target'), \n", " data=data, \n", " size = 3, \n", " shape = 21, \n", " color='black') \\\n", " + scale_fill_manual(values=['red', 'blue']) \\\n", " + theme(legend_position='none')\n", "p" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def make_grid(X, grid_size=40):\n", " min_val, max_val = np.min(X), np.max(X)\n", " delta = (max_val - min_val) / grid_size\n", " xx = yy = np.arange(min_val, min_val + delta * (grid_size + 1), delta)\n", " grid = np.vstack(np.meshgrid(xx, yy)).reshape(2, - 1).T\n", " return grid" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "grid_size = 40\n", "grid = make_grid(X, grid_size=grid_size)\n", "data_grid = pd.DataFrame(grid, columns=['x1', 'x2'])\n", "data_grid['line'] = np.tile(np.arange(grid_size + 1), grid_size + 1)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p += geom_path(aes(x='x1', y='x2', group='line'), data=data_grid, color='blue', alpha=0.5)\n", "p" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def kernel_rbf(x, xi = None, gamma = None):\n", " x, xi = check_pairwise_arrays(x, xi)\n", " if gamma == None:\n", " gamma = 1.0 / x.shape[1]\n", " K = - gamma * euclidean_distances (x, xi, squared = True)\n", " return np.exp (K)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def kernel_poly(x, xi = None, degree = 3, gamma = None, coef0 = 1):\n", " x, xi = check_pairwise_arrays(x, xi)\n", " if gamma is None:\n", " gamma = 1.0 / x.shape[1]\n", " return (coef0 + gamma * np.dot (x, xi.T)) ** degree" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def kernel_centerer(K):\n", " n = K.shape[0]\n", " In = np.ones((n, n)) / n\n", " Kc = K - np.dot (In, K) - np.dot (K, In) + np.dot (np.dot (In, K), In)\n", " return Kc" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def kernel_eig(K, centered = True):\n", " if not centered:\n", " K = kernel_centerer (K)\n", " eig_val, eig_vec = np.linalg.eigh (K)\n", " # sort eigen values in descending order\n", " idx = np.argsort (eig_val)[::- 1]\n", " eig_val = eig_val[idx]\n", " eig_vec = eig_vec[:, idx]\n", " # zero eigenvectors with zero eigenvalues\n", " eig_vec = eig_vec[:, eig_val > 0]\n", " eig_val = eig_val[eig_val > 0]\n", " return eig_val, eig_vec" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "class KernelTransformer(object):\n", "\n", " def __init__(self, kernel = 'rbf', gamma = None, degree = 3, coef0 = 1):\n", " self.gamma = gamma\n", " self.degree = degree\n", " self.coef0 = coef0\n", " self.kernel = kernel\n", "\n", " def fit(self, X, y):\n", " self.X = X\n", " self.y = y\n", " # compute eigen vectors\n", " K = self.get_kernel (self.X)\n", " self.eig_val, self.eig_vec = kernel_eig(K, centered=False)\n", " return self\n", "\n", " def get_kernel(self, x, xi = None):\n", " x, xi = check_pairwise_arrays(x, xi)\n", " if self.kernel == 'poly':\n", " K = kernel_poly (x, xi, self.degree, self.gamma, self.coef0)\n", " elif self.kernel == 'linear':\n", " self.degree = 1\n", " self.coef0 = 0\n", " self.gamma = 1\n", " K = kernel_poly (x, xi, self.degree, self.gamma, self.coef0)\n", " elif self.kernel == 'rbf':\n", " K = kernel_rbf (x, xi, self.gamma)\n", " return K\n", "\n", " def transform(self, X):\n", " K = self.get_kernel (X, self.X)\n", " return np.dot(K, self.eig_vec / np.sqrt(self.eig_val))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "transformer = KernelTransformer(kernel='rbf', gamma=2, degree=3, coef0=0.)\n", "transformer.fit(X, y);" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "X_transformed = transformer.transform(X)[:,:2]\n", "feature_data = {'x1':X_transformed[:, 0], 'x2':X_transformed[:, 1], 'target':y}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = ggplot() \\\n", " + geom_point(aes(x='x1', y='x2', fill='target', group='target'), \n", " data=feature_data, \n", " size = 3, \n", " shape = 21, \n", " color='black') \\\n", " + scale_fill_manual(values =['red', 'blue']) \\\n", " + theme(legend_position='none')\n", "p" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "transformed_grid = transformer.transform(grid)\n", "transformed_grid = transformed_grid[:,:2]\n", "data_grid = pd.DataFrame (transformed_grid, columns=['x1', 'x2'])\n", "data_grid['line'] = np.tile(np.arange(grid_size + 1), grid_size + 1)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p += geom_path(aes(x='x1', y='x2', group='line'), data=data_grid, color='blue', alpha=0.5)\n", "p" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.25)\n", "dat = {'x':X_train.T[0], 'y':X_train.T[1], 'variable':y_train}" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p = ggplot() \\\n", " + geom_point(aes (x='x1', y='x2', fill='target', group='target'), \n", " data=data, \n", " size = 3, \n", " shape = 21, \n", " color='black') \\\n", " + scale_fill_manual(values =['red', 'blue']) \\\n", " + theme(legend_position='none')\n", "p" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def step_function(x, margin = 0, label =[0, 1]):\n", " return np.where (x >= margin, label[1], label[0])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "class KernelClassifier(object):\n", "\n", " def __init__(self, kernel = 'rbf', gamma = None, degree = 3, coef0 = 1):\n", " self.gamma = gamma\n", " self.degree = degree\n", " self.coef0 = coef0\n", " self.kernel = kernel\n", "\n", " def fit(self, X, y):\n", " self.X = X\n", " self.y = y\n", " # count number of points in subsets\n", " self.n = np.array([np.count_nonzero(1 + np.array (np.where (self.y == self.y[i]))) for i in range(len(self.y))])\n", " # compute bias\n", " a = self.get_kernel(self.X, self.X)\n", " b = step_function(self.y, 0.5,[1, - 1]) / self.n ** 2 / 2.0\n", " self.b = np.sum(np.multiply (a, b))\n", " return self\n", "\n", " def get_kernel(self, x, xi):\n", " if self.kernel == 'poly':\n", " return kernel_poly(x, xi, self.degree, self.gamma, self.coef0)\n", " elif self.kernel == 'linear':\n", " self.degree = 1\n", " self.coef0 = 0\n", " self.gamma = 1\n", " return kernel_poly(x, xi, self.degree, self.gamma, self.coef0)\n", " elif self.kernel == 'rbf':\n", " return kernel_rbf (x, xi, self.gamma)\n", "\n", " def predict(self, X):\n", " a = self.get_kernel (X, self.X)\n", " b = step_function (self.y, 0.5,[- 1, 1]) / self.n\n", " y = np.sum(np.multiply (a, b), axis = 1) + self.b\n", " return step_function (y)\n", "\n", " def decision_function(self, X):\n", " a = self.get_kernel (X, self.X)\n", " b = step_function (self.y, 0.5,[- 1, 1]) / self.n\n", " y = np.sum (np.multiply (a, b), axis = 1) + self.b\n", " return y" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "classifier = KernelClassifier(kernel='rbf', gamma=6.)\n", "classifier.fit(X_train, y_train);" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def get_grid_points(X, resolution=0.05):\n", " x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " return np.meshgrid(np.arange(x1_min, x1_max, resolution), \n", " np.arange(x2_min, x2_max, resolution))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "xx1, xx2 = get_grid_points(X_train)\n", "Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)\n", "xx1, xx2, z = xx1.reshape(- 1), xx2.reshape(- 1), Z.reshape(- 1)\n", "region_data = {'x1':xx1, 'x2':xx2, 'region':z + 1}" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p += geom_raster(aes(x='x1', y='x2',group='region', fill='region'), \n", " data=region_data, alpha=0.3) \\\n", " + scale_fill_manual(values =['red', 'blue'])\n", "p += theme(axis_text='blank', axis_ticks='blank', axis_line='blank', axis_title='blank', legend_position='none')\n", "p += ggsize(500, 500)\n", "p" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "test_data = pd.DataFrame(np.column_stack ((X_test, y_test)), columns =['x1', 'x2', 'target'])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p += geom_point(aes(x='x1', y='x2', color='target'), \n", " data=test_data, \n", " size=5, \n", " alpha=0.5, shape=16) \\\n", " + scale_color_manual(values =['red', 'blue'])\n", "p" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 2 }