{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## This notebook shows an application of kneed in selecting the optimal number of clusters from kmeans clustering algorithm." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Look at the K means tutorial posted [here](https://pythonprogramminglanguage.com/kmeans-elbow-method/)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEICAYAAACktLTqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEcdJREFUeJzt3X9sXWd9x/H3F8eAG9qZCpeRNKIwIRfUbgu7kPJjDFGQQe1ohJjWTN2qjiljG+OHkKGBarAJVCYjRP9ATFHLjyklFSqe6RjCVIWKsdFsTo1mILMo0CVxAnEHbgszkLrf/XGvHSc0ra/vtc/1c98vKbo+zz3nPt+ca398/ZwfT2QmkqSyPKnqAiRJ7We4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7tpQIuL+iJiPiIcjYi4i/j0i3hwRT/i9HBEXRURGxKY1rnFd+pEej+Gujej3M/Nc4NnAh4B3A7dUW5LUWQx3bViZ+WBm3gH8IXBtRFwSEVdExGREPBQRRyLi/cs2+VrjcS4ifhoRL4mI34iIr0TE/0bEAxFxa0T0L24QEe+OiJnGXwrTEXF5o/1JEXF9RHyvse1nI+L8s/WzxrtC+hWGuza8zPwP4Cjwu8DPgD8B+oErgL+IiJ2NVV/ReOzPzKdl5jeAAG4EtgDPB7YB7weIiEHgLcCLGn8pDAH3N17jrcBO4Pca2/4E+Njj9COtK8NdpTgGnJ+Zd2fmVGY+mpn/BeynHsCPKTPvy8w7M/MXmTkLfGTZ+gvAU4AXRERvZt6fmd9rPPfnwHsz82hm/oL6L4Q3Os6uTmG4qxRbgR9HxI6I+GpEzEbEg8CbgWecbaOIuCAibmsMvTwE7FtcPzPvA95OPbhPNNbb0tj02cA/NQ7qzgGHqP8yeOZa/QelZhju2vAi4kXUw/3rwGeAO4BtmflrwD9QH3oBeKxboN7YaP/NzDwPuGbZ+mTmZzLz5dTDPIG/bzx1BHhdZvYv+/fUzJw5Sz/SujLctWFFxHkRcSVwG7AvM6eAc4EfZ+bPI+LFwB8t22QWeBR47rK2c4GfUj/4uRUYXvb6gxHxqoh4CvBzYJ76p3Oo/9L4YEQ8u7HuQERc9Tj9SOvKcNdG9M8R8TD1T8/vpT5Ofl3jub8E/q7x/N8An13cKDP/D/gg8G+N4ZTLgL8FXgg8CPwLMLqsn6dQP9XyAeCHwAXAexrP3UT9L4QvN/q6B9jxOP1I6yqcrEOSyuMnd0kq0Eou2f5ERJyIiG8tazs/Iu6MiO82Hp++tmVKkpqxkk/unwJee0bb9cBdmfk84K7GsiSpQ6xozD0iLgK+kJmXNJangVdm5vGIeBZwd2YOrmWhkqSVW+3VdM/MzOMAjYC/4GwrRsRuYDfA5s2bf+fiiy
9eZZeS1J0OHjz4QGYONLPNml8qnZl7gb0AtVotJyYm1rpLSSpKRPxPs9us9myZHzWGY2g8nljl60iS1sBqw/0O4NrG19cCn29POZKkdljJqZD7gW8AgxFxNCLeRP2qvddExHeB1zSWJUkd4gnH3DNz11meurzNtUiS2sQrVCWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAJtamXjiHgH8GdAAlPAdZn583YUJnWTsckZRsanOTY3z5b+PoaHBtm5fWvVZWkDW/Un94jYCrwVqGXmJUAPcHW7CpO6xdjkDHtGp5iZmyeBmbl59oxOMTY5U3Vp2sBaHZbZBPRFxCbgHOBY6yVJ3WVkfJr5kwuntc2fXGBkfLqiilSCVYd7Zs4AHwYOA8eBBzPzy2euFxG7I2IiIiZmZ2dXX6lUqGNz8021SyvRyrDM04GrgOcAW4DNEXHNmetl5t7MrGVmbWBgYPWVSoXa0t/XVLu0Eq0My7wa+EFmzmbmSWAUeGl7ypK6x/DQIH29Pae19fX2MDw0WFFFKkErZ8scBi6LiHOAeeByYKItVUldZPGsGM+WUTutOtwz80BE3A7cCzwCTAJ721WY1E12bt9qmKutWjrPPTPfB7yvTbVIktrEK1QlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIK1NJdISW1xw1jU+w/cISFTHoi2LVjGx/YeWnVZWkDM9ylit0wNsW+ew4vLS9kLi0b8Foth2Wkiu0/cKSpdmklDHepYguZTbVLK2G4SxXriWiqXVoJw12q2K4d25pql1bCA6pSxRYPmnq2jNopch3H9Wq1Wk5MTKxbf5JUgog4mJm1ZrZxWEaSCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalALd3yNyL6gZuBS4AE/jQzv9GOwko1NjnDyPg0x+bm2dLfx/DQIDu3b626rK7VKe9Hp9ShcrR6P/ebgC9l5hsj4snAOW2oqVhjkzPsGZ1i/uQCADNz8+wZnQLwB7kCnfJ+dEodKsuqh2Ui4jzgFcAtAJn5y8yca1dhJRoZn176AV40f3KBkfHpiirqbp3yfnRKHSpLK2PuzwVmgU9GxGRE3BwRm89cKSJ2R8REREzMzs620N3Gd2xuvql2ra1OeT86pQ6VpZVw3wS8EPh4Zm4HfgZcf+ZKmbk3M2uZWRsYGGihu41vS39fU+1aW53yfnRKHSpLK+F+FDiamQcay7dTD3udxfDQIH29Pae19fX2MDw0WFFF3a1T3o9OqUNlWfUB1cz8YUQciYjBzJwGLge+077SyrN4cMyzIjpDp7wfnVKHytLSBNkR8dvUT4V8MvB94LrM/MnZ1neCbElq3momyG7pVMjM/CbQVIeSpLXnFaqSVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFanWC7A3jhrEp9h84wkImPRHs2rGND+y8tOqyKjE2OeO9w6XCdUW43zA2xb57Di8tL2QuLXdbwI9NzrBndGppQuaZuXn2jE4BGPBSQbpiWGb/gSNNtZdsZHx6KdgXzZ9cYGR8uqKKJK2Frgj3hbPMNnW29pIdm5tvql3SxtQV4d4T0VR7ybb09zXVLmlj6opw37VjW1PtJRseGqSvt+e0tr7eHoaHBiuqSNJa6IoDqosHTT1b5tRBU8+WkcoWuY7jzrVaLScmJtatP0kqQUQczMxaM9t0xb
CMJHUbw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalALd/PPSJ6gAlgJjOvbL2ktTE2OeM9zCV1jXZM1vE24BBwXhtea02MTc6wZ3RqaWLombl59oxOARjwkorU0rBMRFwIXAHc3J5y1sbI+PRSsC+aP7nAyPh0RRVJ0tpqdcz9o8C7gEfPtkJE7I6IiYiYmJ2dbbG71Tk2N99UuyRtdKsO94i4EjiRmQcfb73M3JuZtcysDQwMrLa7lmzp72uqXZI2ulY+ub8MeH1E3A/cBrwqIva1pao2Gx4apK+357S2vt4ehocGK6pIktbWqsM9M/dk5oWZeRFwNfCVzLymbZW10c7tW7nxDZeytb+PALb293HjGy71YKqkYrXjbJkNYef2rYa5pK7RlnDPzLuBu9vxWpKk1nmFqiQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoG65pa/kjaGsckZRsanOTY3z5b+PoaHBtf9dt2dUEOrDHdJHWNscoY9o1NLE9rPzM2zZ3QKYN3CtRNqaAeHZSR1jJHx6aVQXTR/coGR8emuqqEdDHdJHePY3HxT7aXW0A6Gu6SOsaW/r6n2UmtoB8NdUscYHhqkr7fntLa+3h6Ghwa7qoZ28ICqpI6xeMCyyjNVOqGGdojMXLfOarVaTkxMrFt/klSCiDiYmbVmtnFYRpIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCrvuVvRGwD/hH4deBRYG9m3tSuwiR1pxvGpth/4AgLmfREsGvHNj6w89J1raHbJ8h+BHhnZt4bEecCByPizsz8Tptqk9RlbhibYt89h5eWFzKXltcr4Lt+guzMPJ6Z9za+fhg4BGyc/7mkjrP/wJGm2teCE2QvExEXAduBA4/x3O6ImIiIidnZ2XZ0J6lQC2eZPOhs7WvBCbIbIuJpwOeAt2fmQ2c+n5l7M7OWmbWBgYFWu5NUsJ6IptrXghNkAxHRSz3Yb83M0faUJKlb7dqxran2tdD1E2RHRAC3AIcy8yPtK0lSt1o8aFrl2TJdP0F2RLwc+FdgivqpkADvycwvnm0bJ8iWpOatZoLsVX9yz8yvA+s3ECZJWjGvUJWkAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSpQKxNkSyrI2OTMhr+HebuUsC8Md0mMTc6wZ3RqaWLombl59oxOAWy4UGtVKfvCYRlJjIxPL4XZovmTC4yMT1dUUXVK2ReGuySOzc031V6yUvaF4S6JLf19TbWXrJR9YbhLYnhokL7entPa+np7GB4arKii6pSyLzygKmnpQOFGP0OkHUrZF5GZ69ZZrVbLiYmJdetPkkoQEQczs9bMNg7LSFKBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCtRTuEfHaiJiOiPsi4vp2FSVJas2qwz0ieoCPAa8DXgDsiogXtKswSdLqtfLJ/cXAfZn5/cz8JXAbcFV7ypIktaKVafa2AkeWLR8Fdpy5UkTsBnY3Fn8REd9qoc+SPAN4oOoiOoT74hT3xSnui1OansC1lXCPx2j7lTn7MnMvsBcgIiaanSqqVO6LU9wXp7gvTnFfnBIRTc9P2sqwzFFg27LlC4FjLbyeJKlNWgn3/wSeFxHPiYgnA1cDd7SnLElSK1Y9LJOZj0TEW4BxoAf4RGZ++wk227va/grkvjjFfXGK++IU98UpTe+LyPyVYXJJ0gbnFaqSVCDDXZIKtC7h7m0K6iJiW0R8NSIORcS3I+JtVddUtYjoiYjJiPhC1bVUKSL6I+L2iPjvxv
fHS6quqSoR8Y7Gz8e3ImJ/RDy16prWS0R8IiJOLL8eKCLOj4g7I+K7jcenr+S11jzcvU3BaR4B3pmZzwcuA/6qi/fForcBh6ouogPcBHwpMy8Gfosu3ScRsRV4K1DLzEuon6xxdbVVratPAa89o+164K7MfB5wV2P5Ca3HJ3dvU9CQmccz897G1w9T/wHeWm1V1YmIC4ErgJurrqVKEXEe8ArgFoDM/GVmzlVbVaU2AX0RsQk4hy66fiYzvwb8+Izmq4BPN77+NLBzJa+1HuH+WLcp6NpAWxQRFwHbgQPVVlKpjwLvAh6tupCKPReYBT7ZGKK6OSI2V11UFTJzBvgwcBg4DjyYmV+utqrKPTMzj0P9AyJwwUo2Wo9wX9FtCrpJRDwN+Bzw9sx8qOp6qhARVwInMvNg1bV0gE3AC4GPZ+Z24Ges8E/v0jTGk68CngNsATZHxDXVVrUxrUe4e5uCZSKil3qw35qZo1XXU6GXAa+PiPupD9W9KiL2VVtSZY4CRzNz8a+426mHfTd6NfCDzJzNzJPAKPDSimuq2o8i4lkAjccTK9loPcLd2xQ0RERQH1c9lJkfqbqeKmXmnsy8MDMvov498ZXM7MpPaJn5Q+BIRCze+e9y4DsVllSlw8BlEXFO4+flcrr04PIydwDXNr6+Fvj8SjZq5a6QK7LK2xSU6mXAHwNTEfHNRtt7MvOLFdakzvDXwK2ND0DfB66ruJ5KZOaBiLgduJf62WWTdNFtCCJiP/BK4BkRcRR4H/Ah4LMR8Sbqv/z+YEWv5e0HJKk8XqEqSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KB/h+XixK50q+dIAAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# clustering dataset\n", "# determine k using elbow method\n", " \n", "from sklearn.cluster import KMeans\n", "from sklearn import metrics\n", "from scipy.spatial.distance import cdist\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", " \n", "x1 = np.array([3, 1, 1, 2, 1, 6, 6, 6, 5, 6, 7, 8, 9, 8, 9, 9, 8])\n", "x2 = np.array([5, 4, 5, 6, 5, 8, 6, 7, 6, 7, 1, 2, 1, 2, 3, 2, 3])\n", " \n", "plt.plot()\n", "plt.xlim([0, 10])\n", "plt.ylim([0, 10])\n", "plt.title('Dataset')\n", "plt.scatter(x1, x2)\n", "plt.show()\n", " \n", "# create new plot and data\n", "plt.plot()\n", "X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)\n", "colors = ['b', 'g', 'r']\n", "markers = ['o', 'v', 's']\n", " \n", "# k means determine k\n", "# distortion = mean Euclidean distance from each point to its nearest centroid\n", "distortions = []\n", "K = range(1,10)\n", "for k in K:\n", "    # fit once per k; random_state pins the centroid initialisation so the\n", "    # distortion curve (and the knee found from it) is the same on every run\n", "    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)\n", "    distortions.append(sum(np.min(cdist(X, 
kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])\n", "\n", "# Plot the elbow\n", "plt.plot(K, distortions, 'bx-')\n", "plt.xlabel('k')\n", "plt.ylabel('Distortion')\n", "plt.title('The Elbow Method showing the optimal k')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Focus on the second plot, which shows the distortion as a function of the number of clusters k" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "range(1, 10) \n", " [3.4577032384495707, 1.7687413573405673, 0.8819889697423957, 0.7587138847606585, 0.6635212812400347, 0.5808803063754726, 0.5093717077076824, 0.42618267462691206, 0.3529411764705882]\n" ] } ], "source": [ "print(K, '\\n', distortions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use kneed to determine the best k" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import sys\n", "# add the parent directory to the import path\n", "# (presumably so the kneed package from this repository is the one imported -- confirm)\n", "sys.path.append('..')\n", "\n", "from kneed import KneeLocator" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# distortion falls as k grows, so search for the knee of a convex, decreasing curve\n", "kn = KneeLocator(list(K), distortions, S=1.0, curve='convex', direction='decreasing')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kn.knee" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# redraw the elbow curve and mark the k reported by kneed with a dashed vertical line\n", "plt.xlabel('k')\n", "plt.ylabel('Distortion')\n", "plt.title('The Elbow Method showing the optimal k')\n", "plt.plot(K, distortions, 'bx-')\n", "plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')\n", "# end on plt.show() so the cell renders the figure instead of echoing the vlines return value\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { 
"display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }