#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
=========================================================
Pipelining: chaining a PCA and a logistic regression
=========================================================

The PCA does an unsupervised dimensionality reduction, while the logistic
regression does the prediction.

We use a GridSearchCV to set the dimensionality of the PCA.
"""
print(__doc__)

# Code source: Gael Varoquaux
# Modified for Documentation merge by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Raise the iteration cap so the solver converges on the digits data.
logistic = linear_model.LogisticRegression(max_iter=10000)
pca = decomposition.PCA()

# Chain the two steps: PCA reduces the dimensionality, logistic regression
# predicts the digit class from the reduced representation.
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

###############################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

###############################################################################
# Prediction

n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipelines can be set using '__' separated parameter names:
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)

# Mark the number of components selected by the grid search on the spectrum.
plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()
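
###############################################################################
# A minimal sketch of inspecting the search results (not part of the original
# example): best_params_ and best_score_ are standard GridSearchCV attributes
# holding the winning parameter combination and its mean cross-validated score.
print("Best parameters: %s" % estimator.best_params_)
print("Best CV accuracy: %.3f" % estimator.best_score_)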
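
# The '__' convention also works when setting pipeline parameters directly;
# this sketch mirrors what GridSearchCV does internally for each candidate
# combination (the values 40 and 1.0 are illustrative, not search results).
pipe.set_params(pca__n_components=40, logistic__C=1.0)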