{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Model Selection\n", "=================" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grid-Search with built-in cross-validation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.grid_search import GridSearchCV" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define parameter grid:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}\n", "print(param_grid)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.svm import SVC\n", "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A GridSearchCV object behaves just like a normal classifier." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.datasets import load_digits\n", "from sklearn.cross_validation import train_test_split\n", "digits = load_digits()\n", "# Fixed seed: without it every run draws a different train/test split,\n", "# so the scores reported below would not be reproducible.\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    digits.data, digits.target, random_state=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "# We extract just the scores\n", "scores = [x[1] for x in grid_search.grid_scores_]\n", "scores = np.array(scores).reshape(6, 6)\n", "\n", "plt.matshow(scores)\n", "plt.xlabel('gamma')\n", "plt.ylabel('C')\n", "plt.colorbar()\n", "plt.xticks(np.arange(6), param_grid['gamma'])\n", "plt.yticks(np.arange(6), param_grid['C'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_params_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.predict(X_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.score(X_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Preprocessing and Pipelines\n", "=============================" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.preprocessing import StandardScaler" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Same interface as always." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler = StandardScaler()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.fit(X_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.transform(X_train).mean(axis=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.transform(X_train).std(axis=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For cross-validation, we need to estimate mean and standard deviation separately for each fold.\n", "To do that, we build a pipeline." ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.pipeline import Pipeline" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline = Pipeline([(\"scaler\", scaler), (\"svm\", SVC())])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.predict(X_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cross-validation with a pipeline\n", "---------------------------------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import cross_val_score\n", "cross_val_score(pipeline, X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So, yeah, don't forget the preprocessing." ] }, { "cell_type": "code", "collapsed": false, "input": [ "param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. 
** np.arange(-3, 3)}\n", "\n", "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_pipeline.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# We extract just the scores\n", "scores = [x[1] for x in grid_pipeline.grid_scores_]\n", "scores = np.array(scores).reshape(6, 6)\n", "\n", "plt.matshow(scores)\n", "plt.xlabel('gamma')\n", "plt.ylabel('C')\n", "plt.colorbar()\n", "# Label the axes from the grid that was actually searched here, so the\n", "# ticks stay correct if param_grid_pipeline is changed independently.\n", "plt.xticks(np.arange(6), param_grid_pipeline['svm__gamma'])\n", "plt.yticks(np.arange(6), param_grid_pipeline['svm__C'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_pipeline.score(X_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Randomized Searching\n", "======================" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.grid_search import RandomizedSearchCV" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from scipy.stats import expon" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# Draw all 1000 samples in one vectorized call; this also avoids xrange,\n", "# which only exists on Python 2.\n", "plt.hist(expon.rvs(size=1000))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "params = {'C': expon(), 'gamma': expon()}\n", "rs = RandomizedSearchCV(SVC(), param_distributions=params, n_iter=50, verbose=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.best_params_" ], "language": "python", "metadata": {}, "outputs": 
[] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.best_score_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['C'], score.parameters['gamma']) for score in rs.grid_scores_])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "plt.scatter(Cs, gammas, s=40, c=scores)\n", "plt.xlabel(\"C\")\n", "plt.ylabel(\"gamma\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tasks\n", "=====\n", "1. Run a grid search over a pipeline consisting of SelectKBest feature selection and an RBF-kernel SVM on the iris dataset." ] } ], "metadata": {} } ] }