{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Demonstrating the different strategies of KBinsDiscretizer\n",
    "\n",
    "This example presents the different strategies implemented in KBinsDiscretizer:\n",
    "\n",
    "- 'uniform': The discretization is uniform in each feature, which means that\n",
    "  the bin widths are constant in each dimension.\n",
    "- 'quantile': The discretization is done on the quantiled values, which means\n",
    "  that each bin has approximately the same number of samples.\n",
    "- 'kmeans': The discretization is based on the centroids of a KMeans clustering\n",
    "  procedure.\n",
    "\n",
    "The plot shows the regions where the discretized encoding is constant."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Authors: The scikit-learn developers\n",
    "# SPDX-License-Identifier: BSD-3-Clause\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.preprocessing import KBinsDiscretizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the datasets\n",
    "\n",
    "Three two-dimensional datasets of `n_samples` points each: one drawn uniformly\n",
    "from the square [-3, 3] x [-3, 3], and two built with `make_blobs` whose\n",
    "clusters have unequal sizes. Every generator uses the same seed so that the\n",
    "figure is reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Tunable settings for the whole example.\n",
    "strategies = [\"uniform\", \"quantile\", \"kmeans\"]\n",
    "n_bins = 4  # bins per feature requested from KBinsDiscretizer\n",
    "grid_resolution = 300  # points per axis of the evaluation grid\n",
    "\n",
    "n_samples = 200\n",
    "random_state = 42\n",
    "centers_0 = np.array([[0, 0], [0, 5], [2, 4], [8, 8]])\n",
    "centers_1 = np.array([[0, 0], [3, 1]])\n",
    "\n",
    "# construct the datasets\n",
    "X_list = [\n",
    "    # uniform samples in a square\n",
    "    np.random.RandomState(random_state).uniform(-3, 3, size=(n_samples, 2)),\n",
    "    # four blobs holding 10% / 40% / 10% / 40% of the samples\n",
    "    make_blobs(\n",
    "        n_samples=[\n",
    "            n_samples // 10,\n",
    "            n_samples * 4 // 10,\n",
    "            n_samples // 10,\n",
    "            n_samples * 4 // 10,\n",
    "        ],\n",
    "        cluster_std=0.5,\n",
    "        centers=centers_0,\n",
    "        random_state=random_state,\n",
    "    )[0],  # keep the samples, drop the cluster labels\n",
    "    # two blobs holding 20% / 80% of the samples\n",
    "    make_blobs(\n",
    "        n_samples=[n_samples // 5, n_samples * 4 // 5],\n",
    "        cluster_std=0.5,\n",
    "        centers=centers_1,\n",
    "        random_state=random_state,\n",
    "    )[0],\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot the discretized regions\n",
    "\n",
    "Each row is one dataset. The first column shows the raw samples; every other\n",
    "column overlays, on a dense grid covering the data, the ordinal bin index that\n",
    "one strategy assigns to each of the two features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One row per dataset; one column for the input plus one per strategy.\n",
    "n_rows, n_cols = len(X_list), len(strategies) + 1\n",
    "fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 9), squeeze=False)\n",
    "\n",
    "for ds_cnt, (X, row_axes) in enumerate(zip(X_list, axes)):\n",
    "    # Dense evaluation grid spanning the bounding box of the samples.\n",
    "    xx, yy = np.meshgrid(\n",
    "        np.linspace(X[:, 0].min(), X[:, 0].max(), grid_resolution),\n",
    "        np.linspace(X[:, 1].min(), X[:, 1].max(), grid_resolution),\n",
    "    )\n",
    "    grid = np.c_[xx.ravel(), yy.ravel()]\n",
    "\n",
    "    # First column: the raw input data.\n",
    "    input_ax = row_axes[0]\n",
    "    input_ax.scatter(X[:, 0], X[:, 1], edgecolors=\"k\")\n",
    "    if ds_cnt == 0:\n",
    "        input_ax.set_title(\"Input data\", size=14)\n",
    "\n",
    "    # Remaining columns: transform the grid with KBinsDiscretizer,\n",
    "    # one strategy per panel.\n",
    "    for ax, strategy in zip(row_axes[1:], strategies):\n",
    "        enc = KBinsDiscretizer(n_bins=n_bins, encode=\"ordinal\", strategy=strategy)\n",
    "        enc.fit(X)\n",
    "        grid_encoded = enc.transform(grid)\n",
    "\n",
    "        # Bin index of feature 0 (the x coordinate of the grid).\n",
    "        bins_feature_0 = grid_encoded[:, 0].reshape(xx.shape)\n",
    "        ax.contourf(xx, yy, bins_feature_0, alpha=0.5)\n",
    "        # Bin index of feature 1 (the y coordinate of the grid).\n",
    "        bins_feature_1 = grid_encoded[:, 1].reshape(xx.shape)\n",
    "        ax.contourf(xx, yy, bins_feature_1, alpha=0.5)\n",
    "\n",
    "        # Draw the samples last so they stay visible above the overlays.\n",
    "        ax.scatter(X[:, 0], X[:, 1], edgecolors=\"k\")\n",
    "        if ds_cnt == 0:\n",
    "            ax.set_title(f\"strategy='{strategy}'\", size=14)\n",
    "\n",
    "    # Same view limits and no ticks for every panel of this dataset.\n",
    "    for ax in row_axes:\n",
    "        ax.set_xlim(xx.min(), xx.max())\n",
    "        ax.set_ylim(yy.min(), yy.max())\n",
    "        ax.set_xticks(())\n",
    "        ax.set_yticks(())\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}