{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Feature agglomeration vs. univariate selection\n",
    "\n",
    "This example compares 2 dimensionality reduction strategies:\n",
    "\n",
    "- univariate feature selection with Anova\n",
    "\n",
    "- feature agglomeration with Ward hierarchical clustering\n",
    "\n",
    "Both methods are compared in a regression problem using\n",
    "a BayesianRidge as supervised estimator.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Authors: The scikit-learn developers\n",
    "# SPDX-License-Identifier: BSD-3-Clause"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import shutil\n",
    "import tempfile\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from joblib import Memory\n",
    "from scipy import linalg, ndimage\n",
    "\n",
    "from sklearn import feature_selection\n",
    "from sklearn.cluster import FeatureAgglomeration\n",
    "from sklearn.feature_extraction.image import grid_to_graph\n",
    "from sklearn.linear_model import BayesianRidge\n",
    "from sklearn.model_selection import GridSearchCV, KFold\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "n_samples = 200  # number of simulated samples (rows of X)\n",
    "size = 40  # image size: each sample has size * size features\n",
    "roi_size = 15  # side length of the two square regions with non-zero weights\n",
    "snr = 5.0  # controls the amplitude of the noise added to the target\n",
    "np.random.seed(0)  # seed the global NumPy RNG used to draw X and the noise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# True weights: a -1 square in one corner and a +1 square in the opposite one\n",
    "coef = np.zeros((size, size))\n",
    "coef[0:roi_size, 0:roi_size] = -1.0\n",
    "coef[-roi_size:, -roi_size:] = 1.0\n",
    "\n",
    "X = np.random.randn(n_samples, size**2)\n",
    "for x in X:  # smooth data, one (size, size) image at a time\n",
    "    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()\n",
    "# standardize every feature across samples\n",
    "X -= X.mean(axis=0)\n",
    "X /= X.std(axis=0)\n",
    "\n",
    "# Noise-free target, kept under its own name so that the noise cell below\n",
    "# can be re-run without stacking noise on an already noisy target.\n",
    "y_clean = np.dot(X, coef.ravel())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add noise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "noise = np.random.randn(y_clean.shape[0])\n",
    "# NOTE(review): dividing snr by 20 suggests a decibel amplitude ratio, which\n",
    "# would be 10 ** (snr / 20.0) rather than np.exp(snr / 20.0). Left unchanged\n",
    "# so the example's output stays the same -- confirm the intended scaling.\n",
    "noise_coef = (linalg.norm(y_clean, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)\n",
    "y = y_clean + noise_coef * noise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute the coefficients of a Bayesian Ridge with grid search\n",
    "\n",
    "Shared setup for both pipelines below: the cross-validation splitter, the\n",
    "`BayesianRidge` estimator and a temporary joblib cache."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cv = KFold(2)  # cross-validation generator for model selection\n",
    "ridge = BayesianRidge()\n",
    "cachedir = tempfile.mkdtemp()  # removed again in the last cell\n",
    "mem = Memory(location=cachedir, verbose=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ward agglomeration followed by BayesianRidge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "connectivity = grid_to_graph(n_x=size, n_y=size)\n",
    "ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)\n",
    "ward_pipeline = Pipeline([(\"ward\", ward), (\"ridge\", ridge)])\n",
    "# Select the optimal number of parcels with grid search\n",
    "ward_search = GridSearchCV(\n",
    "    ward_pipeline, {\"ward__n_clusters\": [10, 20, 30]}, n_jobs=1, cv=cv\n",
    ")\n",
    "ward_search.fit(X, y)  # set the best parameters\n",
    "best_ward = ward_search.best_estimator_\n",
    "# Map the ridge weights from the agglomerated features back to all features\n",
    "coef_ward = best_ward.named_steps[\"ward\"].inverse_transform(\n",
    "    best_ward.named_steps[\"ridge\"].coef_\n",
    ")\n",
    "coef_agglomeration_ = coef_ward.reshape(size, size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Anova univariate feature selection followed by BayesianRidge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cached_f_regression = mem.cache(feature_selection.f_regression)  # caching function\n",
    "anova = feature_selection.SelectPercentile(cached_f_regression)\n",
    "anova_pipeline = Pipeline([(\"anova\", anova), (\"ridge\", ridge)])\n",
    "# Select the optimal percentage of features with grid search\n",
    "anova_search = GridSearchCV(\n",
    "    anova_pipeline, {\"anova__percentile\": [5, 10, 20]}, cv=cv\n",
    ")\n",
    "anova_search.fit(X, y)  # set the best parameters\n",
    "best_anova = anova_search.best_estimator_\n",
    "# The ridge weights are reshaped to a single row before inverting the selection\n",
    "coef_anova = best_anova.named_steps[\"anova\"].inverse_transform(\n",
    "    best_anova.named_steps[\"ridge\"].coef_.reshape(1, -1)\n",
    ")\n",
    "coef_selection_ = coef_anova.reshape(size, size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot the results as images\n",
    "\n",
    "The estimated coefficients were mapped back to the original feature space\n",
    "with `inverse_transform` in the two cells above, so each set of weights can\n",
    "be shown as a `size` x `size` image next to the true weights."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.close(\"all\")\n",
    "fig, axes = plt.subplots(1, 3, figsize=(7.3, 2.7))\n",
    "panels = [\n",
    "    (\"True weights\", coef),\n",
    "    (\"Feature Selection\", coef_selection_),\n",
    "    (\"Feature Agglomeration\", coef_agglomeration_),\n",
    "]\n",
    "for ax, (title, weights) in zip(axes, panels):\n",
    "    ax.imshow(weights, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\n",
    "    ax.set_title(title)\n",
    "fig.subplots_adjust(\n",
    "    left=0.04, bottom=0.0, right=0.98, top=0.94, wspace=0.16, hspace=0.26\n",
    ")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean up\n",
    "\n",
    "Attempt to remove the temporary cachedir, but don't worry if it fails."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "shutil.rmtree(cachedir, ignore_errors=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}