{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Inductive Clustering\n",
    "\n",
    "Clustering can be expensive, especially when our dataset contains millions\n",
    "of datapoints. Many clustering algorithms are not :term:`inductive` and so\n",
    "cannot be directly applied to new data samples without recomputing the\n",
    "clustering, which may be intractable. Instead, we can use clustering to then\n",
    "learn an inductive model with a classifier, which has several benefits:\n",
    "\n",
    "- it allows the clusters to scale and apply to new data\n",
    "- unlike re-fitting the clusters to new samples, it makes sure the labelling\n",
    "  procedure is consistent over time\n",
    "- it allows us to use the inferential capabilities of the classifier to\n",
    "  describe or explain the clusters\n",
    "\n",
    "This example illustrates a generic implementation of a meta-estimator which\n",
    "extends clustering by inducing a classifier from the cluster labels.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Authors: The scikit-learn developers\n",
    "# SPDX-License-Identifier: BSD-3-Clause\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.base import BaseEstimator, clone\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.inspection import DecisionBoundaryDisplay\n",
    "from sklearn.utils.metaestimators import available_if\n",
    "from sklearn.utils.validation import check_is_fitted\n",
    "\n",
    "N_SAMPLES = 5000\n",
    "RANDOM_STATE = 42\n",
    "# Number of clusters requested from the clusterer; also fixes the range of\n",
    "# the integer labels (0 .. N_CLUSTERS - 1) used to color the scatter plots.\n",
    "N_CLUSTERS = 3\n",
    "\n",
    "\n",
    "def _classifier_has(attr):\n",
    "    \"\"\"Check if we can delegate a method to the underlying classifier.\n",
    "\n",
    "    First, we check the fitted classifier if available, otherwise we\n",
    "    check the unfitted classifier.\n",
    "    \"\"\"\n",
    "    return lambda estimator: (\n",
    "        hasattr(estimator.classifier_, attr)\n",
    "        if hasattr(estimator, \"classifier_\")\n",
    "        else hasattr(estimator.classifier, attr)\n",
    "    )\n",
    "\n",
    "\n",
    "class InductiveClusterer(BaseEstimator):\n",
    "    \"\"\"Meta-estimator that induces a classifier from cluster labels.\n",
    "\n",
    "    ``fit`` clusters the training data with a clone of ``clusterer`` and then\n",
    "    trains a clone of ``classifier`` on the resulting cluster labels, so that\n",
    "    cluster membership of new samples can be obtained from the classifier.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    clusterer : estimator\n",
    "        Clustering estimator; its ``fit_predict`` method is used in ``fit``.\n",
    "    classifier : estimator\n",
    "        Classifier trained on the cluster labels. ``predict`` and\n",
    "        ``decision_function`` are exposed only if the classifier has them.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, clusterer, classifier):\n",
    "        self.clusterer = clusterer\n",
    "        self.classifier = classifier\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Cluster ``X`` and fit the classifier on the cluster labels.\n",
    "\n",
    "        ``y`` is ignored; the training targets are the cluster labels.\n",
    "        \"\"\"\n",
    "        # Work on clones so the estimators passed to __init__ stay untouched.\n",
    "        self.clusterer_ = clone(self.clusterer)\n",
    "        self.classifier_ = clone(self.classifier)\n",
    "        cluster_labels = self.clusterer_.fit_predict(X)\n",
    "        self.classifier_.fit(X, cluster_labels)\n",
    "        return self\n",
    "\n",
    "    @available_if(_classifier_has(\"predict\"))\n",
    "    def predict(self, X):\n",
    "        \"\"\"Return the fitted classifier's predictions for ``X``.\"\"\"\n",
    "        check_is_fitted(self)\n",
    "        return self.classifier_.predict(X)\n",
    "\n",
    "    @available_if(_classifier_has(\"decision_function\"))\n",
    "    def decision_function(self, X):\n",
    "        \"\"\"Return the fitted classifier's decision function for ``X``.\"\"\"\n",
    "        check_is_fitted(self)\n",
    "        return self.classifier_.decision_function(X)\n",
    "\n",
    "\n",
    "def plot_scatter(X, color, alpha=0.5, **scatter_kwargs):\n",
    "    \"\"\"Scatter the first two columns of ``X`` on the current axes.\n",
    "\n",
    "    ``color`` is passed as ``c`` to ``plt.scatter``; any extra keyword\n",
    "    arguments (e.g. ``vmin``/``vmax``) are forwarded to it unchanged.\n",
    "    \"\"\"\n",
    "    return plt.scatter(\n",
    "        X[:, 0], X[:, 1], c=color, alpha=alpha, edgecolor=\"k\", **scatter_kwargs\n",
    "    )\n",
    "\n",
    "\n",
    "# Generate some training data from clustering\n",
    "X, y = make_blobs(\n",
    "    n_samples=N_SAMPLES,\n",
    "    cluster_std=[1.0, 1.0, 0.5],\n",
    "    centers=[(-5, -5), (0, 0), (5, 5)],\n",
    "    random_state=RANDOM_STATE,\n",
    ")\n",
    "\n",
    "\n",
    "# Train a clustering algorithm on the training data and get the cluster labels\n",
    "clusterer = AgglomerativeClustering(n_clusters=N_CLUSTERS)\n",
    "cluster_labels = clusterer.fit_predict(X)\n",
    "\n",
    "plt.figure(figsize=(12, 4))\n",
    "\n",
    "plt.subplot(131)\n",
    "plot_scatter(X, cluster_labels)\n",
    "plt.title(\"Ward Linkage\")\n",
    "\n",
    "\n",
    "# Generate new samples and plot them along with the original dataset\n",
    "X_new, y_new = make_blobs(\n",
    "    n_samples=10, centers=[(-7, -1), (-2, 4), (3, 6)], random_state=RANDOM_STATE\n",
    ")\n",
    "\n",
    "plt.subplot(132)\n",
    "plot_scatter(X, cluster_labels)\n",
    "plot_scatter(X_new, \"black\", 1)\n",
    "plt.title(\"Unknown instances\")\n",
    "\n",
    "\n",
    "# Declare the inductive learning model that will be used to\n",
    "# predict cluster membership for unknown instances\n",
    "classifier = RandomForestClassifier(random_state=RANDOM_STATE)\n",
    "inductive_learner = InductiveClusterer(clusterer, classifier).fit(X)\n",
    "\n",
    "probable_clusters = inductive_learner.predict(X_new)\n",
    "\n",
    "\n",
    "ax = plt.subplot(133)\n",
    "plot_scatter(X, cluster_labels)\n",
    "# Pin the color scale to the full label range: with only a handful of new\n",
    "# samples not every cluster may be predicted, and an auto-scaled colormap\n",
    "# would then give the new points colors that do not match their clusters.\n",
    "plot_scatter(X_new, probable_clusters, vmin=0, vmax=N_CLUSTERS - 1)\n",
    "\n",
    "# Plotting decision regions\n",
    "DecisionBoundaryDisplay.from_estimator(\n",
    "    inductive_learner, X, response_method=\"predict\", alpha=0.4, ax=ax\n",
    ")\n",
    "plt.title(\"Classify unknown instances\")\n",
    "\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}