{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Classifier comparison\n\nA comparison of several classifiers in scikit-learn on synthetic datasets.\nThe point of this example is to illustrate the nature of decision boundaries\nof different classifiers.\nThis should be taken with a grain of salt, as the intuition conveyed by\nthese examples does not necessarily carry over to real datasets.\n\nParticularly in high-dimensional spaces, data can more easily be separated\nlinearly and the simplicity of classifiers such as naive Bayes and linear SVMs\nmight lead to better generalization than is achieved by other classifiers.\n\nThe plots show training points in solid colors and testing points\nsemi-transparent. The lower right shows the classification accuracy on the test\nset.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom matplotlib.colors import ListedColormap\n\nfrom sklearn.datasets import make_circles, make_classification, make_moons\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\nfrom sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\nfrom sklearn.inspection import DecisionBoundaryDisplay\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\nfrom sklearn.tree import DecisionTreeClassifier\n\nnames = [\n \"Nearest Neighbors\",\n \"Linear SVM\",\n \"RBF SVM\",\n \"Gaussian Process\",\n \"Decision Tree\",\n \"Random Forest\",\n \"Neural Net\",\n \"AdaBoost\",\n \"Naive Bayes\",\n \"QDA\",\n]\n\nclassifiers = [\n KNeighborsClassifier(3),\n SVC(kernel=\"linear\", C=0.025, random_state=42),\n SVC(gamma=2, C=1, random_state=42),\n GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),\n DecisionTreeClassifier(max_depth=5, random_state=42),\n RandomForestClassifier(\n max_depth=5, n_estimators=10, max_features=1, random_state=42\n ),\n MLPClassifier(alpha=1, max_iter=1000, random_state=42),\n AdaBoostClassifier(random_state=42),\n GaussianNB(),\n QuadraticDiscriminantAnalysis(),\n]\n\nX, y = make_classification(\n n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1\n)\nrng = np.random.RandomState(2)\nX += 2 * rng.uniform(size=X.shape)\nlinearly_separable = (X, y)\n\ndatasets = [\n make_moons(noise=0.3, random_state=0),\n make_circles(noise=0.2, factor=0.5, random_state=1),\n linearly_separable,\n]\n\nfigure = plt.figure(figsize=(27, 9))\ni = 1\n# iterate over datasets\nfor ds_cnt, ds in enumerate(datasets):\n # preprocess dataset, split into training and test part\n X, y = ds\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.4, random_state=42\n )\n\n x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5\n y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5\n\n # just plot the dataset first\n cm = plt.cm.RdBu\n cm_bright = ListedColormap([\"#FF0000\", \"#0000FF\"])\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n if ds_cnt == 0:\n ax.set_title(\"Input data\")\n # Plot the training points\n ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors=\"k\")\n # Plot the testing points\n ax.scatter(\n X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors=\"k\"\n )\n ax.set_xlim(x_min, x_max)\n ax.set_ylim(y_min, y_max)\n ax.set_xticks(())\n ax.set_yticks(())\n i += 1\n\n # iterate over classifiers\n for name, clf in zip(names, classifiers):\n ax = plt.subplot(len(datasets), len(classifiers) + 1, i)\n\n clf = make_pipeline(StandardScaler(), clf)\n clf.fit(X_train, y_train)\n score = clf.score(X_test, y_test)\n DecisionBoundaryDisplay.from_estimator(\n clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5\n )\n\n # Plot the training points\n ax.scatter(\n X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors=\"k\"\n )\n # Plot the testing points\n ax.scatter(\n X_test[:, 0],\n X_test[:, 1],\n c=y_test,\n cmap=cm_bright,\n edgecolors=\"k\",\n alpha=0.6,\n )\n\n ax.set_xlim(x_min, x_max)\n ax.set_ylim(y_min, y_max)\n ax.set_xticks(())\n ax.set_yticks(())\n if ds_cnt == 0:\n ax.set_title(name)\n ax.text(\n x_max - 0.3,\n y_min + 0.3,\n (\"%.2f\" % score).lstrip(\"0\"),\n size=15,\n horizontalalignment=\"right\",\n )\n i += 1\n\nplt.tight_layout()\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }