{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Empirical evaluation of the impact of k-means initialization\n\nEvaluate the ability of k-means initializations strategies to make\nthe algorithm convergence robust, as measured by the relative standard\ndeviation of the inertia of the clustering (i.e. the sum of squared\ndistances to the nearest cluster center).\n\nThe first plot shows the best inertia reached for each combination\nof the model (``KMeans`` or ``MiniBatchKMeans``), and the init method\n(``init=\"random\"`` or ``init=\"k-means++\"``) for increasing values of the\n``n_init`` parameter that controls the number of initializations.\n\nThe second plot demonstrates one single run of the ``MiniBatchKMeans``\nestimator using a ``init=\"random\"`` and ``n_init=1``. This run leads to\na bad convergence (local optimum), with estimated centers stuck\nbetween ground truth clusters.\n\nThe dataset used for evaluation is a 2D grid of isotropic Gaussian\nclusters widely spaced.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.cm as cm\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\nfrom sklearn.utils import check_random_state, shuffle\n\nrandom_state = np.random.RandomState(0)\n\n# Number of run (with randomly generated dataset) for each strategy so as\n# to be able to compute an estimate of the standard deviation\nn_runs = 5\n\n# k-means models can do several random inits so as to be able to trade\n# CPU time for convergence robustness\nn_init_range = np.array([1, 5, 10, 15, 20])\n\n# Datasets generation parameters\nn_samples_per_center = 100\ngrid_size = 3\nscale = 0.1\nn_clusters = grid_size**2\n\n\ndef make_data(random_state, n_samples_per_center, grid_size, scale):\n random_state = check_random_state(random_state)\n centers = np.array([[i, j] for i in range(grid_size) for j in range(grid_size)])\n n_clusters_true, n_features = centers.shape\n\n noise = random_state.normal(\n scale=scale, size=(n_samples_per_center, centers.shape[1])\n )\n\n X = np.concatenate([c + noise for c in centers])\n y = np.concatenate([[i] * n_samples_per_center for i in range(n_clusters_true)])\n return shuffle(X, y, random_state=random_state)\n\n\n# Part 1: Quantitative evaluation of various init methods\n\n\nplt.figure()\nplots = []\nlegends = []\n\ncases = [\n (KMeans, \"k-means++\", {}, \"^-\"),\n (KMeans, \"random\", {}, \"o-\"),\n (MiniBatchKMeans, \"k-means++\", {\"max_no_improvement\": 3}, \"x-\"),\n (MiniBatchKMeans, \"random\", {\"max_no_improvement\": 3, \"init_size\": 500}, \"d-\"),\n]\n\nfor factory, init, params, format in cases:\n print(\"Evaluation of %s with %s init\" % (factory.__name__, init))\n inertia = np.empty((len(n_init_range), n_runs))\n\n for run_id in range(n_runs):\n X, y = make_data(run_id, n_samples_per_center, grid_size, scale)\n for i, n_init in enumerate(n_init_range):\n km = factory(\n n_clusters=n_clusters,\n init=init,\n random_state=run_id,\n n_init=n_init,\n **params,\n ).fit(X)\n inertia[i, run_id] = km.inertia_\n p = plt.errorbar(\n n_init_range, inertia.mean(axis=1), inertia.std(axis=1), fmt=format\n )\n plots.append(p[0])\n legends.append(\"%s with %s init\" % (factory.__name__, init))\n\nplt.xlabel(\"n_init\")\nplt.ylabel(\"inertia\")\nplt.legend(plots, legends)\nplt.title(\"Mean inertia for various k-means init across %d runs\" % n_runs)\n\n# Part 2: Qualitative visual inspection of the convergence\n\nX, y = make_data(random_state, n_samples_per_center, grid_size, scale)\nkm = MiniBatchKMeans(\n n_clusters=n_clusters, init=\"random\", n_init=1, random_state=random_state\n).fit(X)\n\nplt.figure()\nfor k in range(n_clusters):\n my_members = km.labels_ == k\n color = cm.nipy_spectral(float(k) / n_clusters, 1)\n plt.plot(X[my_members, 0], X[my_members, 1], \".\", c=color)\n cluster_center = km.cluster_centers_[k]\n plt.plot(\n cluster_center[0],\n cluster_center[1],\n \"o\",\n markerfacecolor=color,\n markeredgecolor=\"k\",\n markersize=6,\n )\n plt.title(\n \"Example cluster allocation with a single random init\\nwith MiniBatchKMeans\"\n )\n\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }