{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Model Evaluation\n", "==================\n", "\n", "Cross-validation\n", "-----------------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import cross_val_score, train_test_split" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.datasets import load_digits" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "digits = load_digits()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.svm import SVC" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(SVC(C=1), X_train, y_train, cv=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring=\"f1\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's go to a binary task for a moment (even vs. odd digits)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"average_precision\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(SVC(C=10), X_train, y_train % 2, 
cv=3, scoring=\"roc_auc\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are other ways to do cross-validation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import ShuffleSplit\n", "cross_val_score(SVC(C=10), X_train, y_train, cv=ShuffleSplit(len(X_train), 10, test_size=.4))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Model Selection\n", "=================" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grid-Search with built-in cross-validation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.grid_search import GridSearchCV" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define parameter grid:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}\n", "print(param_grid)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, n_jobs=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A GridSearchCV object behaves just like a normal classifier." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# We extract just the scores\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "scores = [x[1] for x in grid_search.grid_scores_]\n", "scores = np.array(scores).reshape(6, 6)\n", "\n", "plt.matshow(scores)\n", "plt.xlabel('gamma')\n", "plt.ylabel('C')\n", "plt.colorbar()\n", "plt.xticks(np.arange(6), param_grid['gamma'])\n", "plt.yticks(np.arange(6), param_grid['C'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_params_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.predict(X_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.score(X_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Preprocessing and Pipelines\n", "=============================" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.preprocessing import StandardScaler" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Same interface as always." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler = StandardScaler()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.fit(X_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.transform(X_train).mean(axis=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scaler.transform(X_train).std(axis=0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For cross-validation, we need to estimate mean and standard deviation separately for each fold.\n", "To do that, we build a pipeline." ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.pipeline import Pipeline" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline = Pipeline([(\"scaler\", scaler), (\"svm\", SVC())])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.predict(X_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cross-validation with a pipeline\n", "---------------------------------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cross_val_score(pipeline, X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So, yeah, don't forget the preprocessing." ] }, { "cell_type": "code", "collapsed": false, "input": [ "param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. 
** np.arange(-3, 3)}\n", "\n", "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_pipeline.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# We extract just the scores\n", "scores = [x[1] for x in grid_pipeline.grid_scores_]\n", "scores = np.array(scores).reshape(6, 6)\n", "\n", "plt.matshow(scores)\n", "plt.xlabel('gamma')\n", "plt.ylabel('C')\n", "plt.colorbar()\n", "# Label the ticks from the grid this search was actually run on\n", "plt.xticks(np.arange(6), param_grid_pipeline['svm__gamma'])\n", "plt.yticks(np.arange(6), param_grid_pipeline['svm__C'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_pipeline.score(X_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Randomized Searching\n", "======================" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.grid_search import RandomizedSearchCV" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from scipy.stats import expon" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# Draw all 1000 samples in one call; avoids the Python-2-only xrange\n", "plt.hist(expon.rvs(size=1000))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "params = {'svm__C': expon(), 'svm__gamma': expon()}\n", "rs = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=50, verbose=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.best_params_" ], "language": "python", "metadata": {}, 
"outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rs.best_score_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['svm__C'], score.parameters['svm__gamma']) for score in rs.grid_scores_])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "plt.scatter(Cs, gammas, s=40, c=scores)\n", "plt.xlabel(\"C\")\n", "plt.ylabel(\"gamma\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X = np.random.normal(size=(50, 40))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "y = np.random.randint(2, size=50)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "y" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_selection import SelectKBest" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "f_selection = SelectKBest(k=3).fit(X, y)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_good = f_selection.transform(X)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.svm import LinearSVC" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid = GridSearchCV(LinearSVC(), param_grid={'C': 10. 
** np.arange(-3 ,3)})" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid.fit(X_good, y)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid.best_params_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid.best_score_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }