{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Model Evaluation\n",
      "==================\n",
      "\n",
      "Cross-validation\n",
      "-----------------"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import cross_val_score, train_test_split"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.datasets import load_digits"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "digits = load_digits()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.svm import SVC"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=1), X_train, y_train, cv=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring=\"f1\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's go to a binary task for a moment (even vs uneven)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"average_precision\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"roc_auc\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "There are other ways to do cross-valiation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import ShuffleSplit\n",
      "cross_val_score(SVC(C=10), X_train, y_train, cv=ShuffleSplit(len(X_train), 10, test_size=.4))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Model Selection\n",
      "================="
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Grid-Search with build-in cross validation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.grid_search import GridSearchCV"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Define parameter grid:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}\n",
      "print(param_grid)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, n_jobs=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "A GridSearchCV object behaves just like a normal classifier."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_search.fit(X_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# We extract just the scores\n",
      "%matplotlib inline\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "scores = [x[1] for x in grid_search.grid_scores_]\n",
      "scores = np.array(scores).reshape(6, 6)\n",
      "\n",
      "plt.matshow(scores)\n",
      "plt.xlabel('gamma')\n",
      "plt.ylabel('C')\n",
      "plt.colorbar()\n",
      "plt.xticks(np.arange(6), param_grid['gamma'])\n",
      "plt.yticks(np.arange(6), param_grid['C'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_search.best_params_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_search.predict(X_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_search.score(X_test, y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Preprocessing and Pipelines\n",
      "============================="
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.preprocessing import StandardScaler"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Same interface as always."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scaler = StandardScaler()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scaler.fit(X_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scaler.transform(X_train).mean(axis=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scaler.transform(X_train).std(axis=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "For cross-validation, we need to estimate mean and standard deviation separately for each fold.\n",
      "To do that, we build a pipeline."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.pipeline import Pipeline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pipeline = Pipeline([(\"scaler\", scaler), (\"svm\", SVC())])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pipeline.fit(X_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pipeline.predict(X_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Cross-validation with a pipeline\n",
      "---------------------------------"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(pipeline, X_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "So, yeah, don't forget the preprocessing."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. ** np.arange(-3, 3)}\n",
      "\n",
      "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_pipeline.fit(X_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# We extract just the scores\n",
      "scores = [x[1] for x in grid_pipeline.grid_scores_]\n",
      "scores = np.array(scores).reshape(6, 6)\n",
      "\n",
      "plt.matshow(scores)\n",
      "plt.xlabel('gamma')\n",
      "plt.ylabel('C')\n",
      "plt.colorbar()\n",
      "plt.xticks(np.arange(6), param_grid['gamma'])\n",
      "plt.yticks(np.arange(6), param_grid['C'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid_pipeline.score(X_test, y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Randomized Searching\n",
      "======================"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.grid_search import RandomizedSearchCV"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from scipy.stats import expon"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.hist([expon.rvs() for x in xrange(1000)])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "params = {'svm__C': expon(), 'svm__gamma': expon()}\n",
      "rs = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=50, verbose=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rs.fit(X_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rs.best_params_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rs.best_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['svm__C'], score.parameters['svm__gamma']) for score in rs.grid_scores_])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.scatter(Cs, gammas, s=40, c=scores)\n",
      "plt.xlabel(\"C\")\n",
      "plt.ylabel(\"gamma\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X = np.random.normal(size=(50, 40))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "y = np.random.randint(2, size=50)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "y"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.feature_selection import SelectKBest"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "f_selection = SelectKBest(k=3).fit(X, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_good = f_selection.transform(X)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.svm import LinearSVC"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid = GridSearchCV(LinearSVC(), param_grid={'C': 10. ** np.arange(-3 ,3)})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid.fit(X_good, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid.best_params_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid.best_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}