{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Pipelining: chaining a PCA and a logistic regression\n\nThe PCA does an unsupervised dimensionality reduction, while the logistic\nregression does the prediction.\n\nWe use a GridSearchCV to set the dimensionality of the PCA\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport polars as pl\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import StandardScaler\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\npca = PCA()\n# Define a Standard Scaler to normalize inputs\nscaler = StandardScaler()\n\n# set the tolerance to a large value to make the example faster\nlogistic = LogisticRegression(max_iter=10000, tol=0.1)\npipe = Pipeline(steps=[(\"scaler\", scaler), (\"pca\", pca), (\"logistic\", logistic)])\n\nX_digits, y_digits = datasets.load_digits(return_X_y=True)\n# Parameters of pipelines can be set using '__' separated parameter names:\nparam_grid = {\n \"pca__n_components\": [5, 15, 30, 45, 60],\n \"logistic__C\": np.logspace(-4, 4, 4),\n}\nsearch = GridSearchCV(pipe, param_grid, n_jobs=2)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(\n np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, \"+\", linewidth=2\n)\nax0.set_ylabel(\"PCA explained variance ratio\")\n\nax0.axvline(\n search.best_estimator_.named_steps[\"pca\"].n_components,\n linestyle=\":\",\n label=\"n_components chosen\",\n)\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\ncomponents_col = \"param_pca__n_components\"\nis_max_test_score = pl.col(\"mean_test_score\") == pl.col(\"mean_test_score\").max()\nbest_clfs = (\n pl.LazyFrame(search.cv_results_)\n .filter(is_max_test_score.over(components_col))\n .unique(components_col)\n .sort(components_col)\n .collect()\n)\nax1.errorbar(\n best_clfs[components_col],\n best_clfs[\"mean_test_score\"],\n yerr=best_clfs[\"std_test_score\"],\n)\nax1.set_ylabel(\"Classification accuracy (val)\")\nax1.set_xlabel(\"n_components\")\n\nplt.xlim(-1, 70)\n\nplt.tight_layout()\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }