{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**This Jupyter notebook contains the complimentary code for the Appendix section of the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# A \"nested cross-validation for algorithm selection\" example using scikit-learn" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sebastian Raschka 2016-09-30 \n", "\n", "CPython 3.5.2\n", "IPython 5.1.0\n", "\n", "sklearn 0.17.1\n", "mlxtend 0.4.2\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from mlxtend.data import mnist_data\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.cross_validation import StratifiedKFold\n", "from sklearn.metrics import accuracy_score\n", "import random\n", "\n", "np.random.seed(1)\n", "random.seed(1)\n", "\n", "# Loading and splitting the dataset\n", "# Note that this is a small (stratified) subset\n", "# of MNIST; it consists of 5000 samples only, that is,\n", "# 10% of the original MNIST dataset\n", "# http://yann.lecun.com/exdb/mnist/\n", "X, y = mnist_data()\n", "X = X.astype(np.float32)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", " train_size=0.8,\n", " random_state=1,\n", " stratify=y)\n", "\n", "# Initializing Classifiers\n", "clf1 = LogisticRegression(multi_class='multinomial',\n", " solver='newton-cg',\n", " random_state=1)\n", "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", " leaf_size=50)\n", "clf3 = DecisionTreeClassifier(random_state=1)\n", "clf4 = SVC(random_state=1)\n", "\n", "# Building the pipelines\n", "pipe1 = Pipeline([('std', StandardScaler()),\n", " ('clf1', clf1)])\n", "\n", "pipe2 = Pipeline([('std', StandardScaler()),\n", " ('clf2', clf2)])\n", "\n", "pipe4 = Pipeline([('std', StandardScaler()),\n", " ('clf4', clf4)])\n", "\n", "\n", "# Setting up the parameter grids\n", "param_grid1 = [{'clf1__penalty': ['l2'],\n", " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", " 'clf2__p': [1, 2]}]\n", "\n", "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", " 'criterion': ['gini', 'entropy']}]\n", "\n", "param_grid4 = [{'clf4__kernel': ['rbf'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", " {'clf4__kernel': ['linear'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "\n", "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", "gridcvs = {}\n", "\n", "for pgrid, est, name in zip((param_grid1, param_grid2,\n", " param_grid3, param_grid4),\n", " (pipe1, pipe2, clf3, pipe4),\n", " ('Softmax', 'KNN', 'DTree', 'SVM')):\n", " gcv = GridSearchCV(estimator=est,\n", " 
param_grid=pgrid,\n", "                       scoring='accuracy',\n", "                       n_jobs=1,\n", "                       cv=2,\n", "                       verbose=0,\n", "                       refit=True)\n", "    gridcvs[name] = gcv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "outer fold 1/5 | tuning DTree    | inner ACC 72.38% | outer ACC 81.25%\n", "outer fold 1/5 | tuning KNN      | inner ACC 88.19% | outer ACC 90.62%\n", "outer fold 1/5 | tuning SVM      | inner ACC 89.88% | outer ACC 92.62%\n", "outer fold 1/5 | tuning Softmax  | inner ACC 88.22% | outer ACC 91.88%\n", "outer fold 2/5 | tuning DTree    | inner ACC 75.16% | outer ACC 76.25%\n", "outer fold 2/5 | tuning KNN      | inner ACC 88.62% | outer ACC 90.62%\n", "outer fold 2/5 | tuning SVM      | inner ACC 90.84% | outer ACC 91.25%\n", "outer fold 2/5 | tuning Softmax  | inner ACC 89.00% | outer ACC 90.62%\n", "outer fold 3/5 | tuning DTree    | inner ACC 74.25% | outer ACC 78.75%\n", "outer fold 3/5 | tuning KNN      | inner ACC 87.81% | outer ACC 93.00%\n", "outer fold 3/5 | tuning SVM      | inner ACC 89.69% | outer ACC 92.12%\n", "outer fold 3/5 | tuning Softmax  | inner ACC 89.03% | outer ACC 90.38%\n", "outer fold 4/5 | tuning DTree    | inner ACC 75.03% | outer ACC 73.62%\n", "outer fold 4/5 | tuning KNN      | inner ACC 88.88% | outer ACC 90.50%\n", "outer fold 4/5 | tuning SVM      | inner ACC 90.78% | outer ACC 90.38%\n", "outer fold 4/5 | tuning Softmax  | inner ACC 89.25% | outer ACC 86.50%\n", "outer fold 5/5 | tuning DTree    | inner ACC 73.31% | outer ACC 76.25%\n", "outer fold 5/5 | tuning KNN      | inner ACC 88.41% | outer ACC 90.88%\n", "outer fold 5/5 | tuning SVM      | inner ACC 90.28% | outer ACC 93.00%\n", "outer fold 5/5 | tuning Softmax  | inner ACC 88.16% | outer ACC 90.62%\n" ] } ], "source": [ "cv_scores = {name: [] for name, gs_est in gridcvs.items()}\n", "\n", "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)\n", "\n", "# The outer loop for algorithm selection\n", "c = 1\n", "for outer_train_idx, outer_valid_idx in skfold:\n", "    for name, gs_est in sorted(gridcvs.items()):\n", "        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')\n", "\n", "        # The inner loop for hyperparameter tuning\n", "        gs_est.fit(X_train[outer_train_idx], y_train[outer_train_idx])\n", "        y_pred = gs_est.predict(X_train[outer_valid_idx])\n", "        acc = accuracy_score(y_true=y_train[outer_valid_idx], y_pred=y_pred)\n", "        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %\n", "              (gs_est.best_score_ * 100, acc * 100))\n", "        cv_scores[name].append(acc)\n", "\n", "    c += 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DTree    | outer CV acc. 77.22% +/- 2.584\n", "KNN      | outer CV acc. 91.13% +/- 0.945\n", "Softmax  | outer CV acc. 90.00% +/- 1.827\n", "SVM      | outer CV acc. 91.88% +/- 0.952\n", "\n", "SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n" ] } ], "source": [ "# Looking at the results\n", "for name in cv_scores:\n", "    print('%-8s | outer CV acc. 
%.2f%% +/- %.3f' % (\n", "          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))\n", "print('\\nSVM Best parameters', gridcvs['SVM'].best_params_)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy 90.80% (average over CV test folds)\n", "Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n", "Training Accuracy: 99.92%\n", "Test Accuracy: 93.00%\n" ] } ], "source": [ "# Fitting a model to the whole training set\n", "# using the \"best\" algorithm\n", "best_algo = gridcvs['SVM']\n", "\n", "best_algo.fit(X_train, y_train)\n", "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n", "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n", "\n", "print('Accuracy %.2f%% (average over CV test folds)' %\n", "      (100 * best_algo.best_score_))\n", "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n", "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", "print('Test Accuracy: %.2f%%' % (100 * test_acc))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fitting a model to the whole dataset\n", "# using the \"best\" algorithm and hyperparameter settings\n", "best_clf = best_algo.best_estimator_\n", "final_model = best_clf.fit(X, y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Nested CV for algorithm selection in scikit-learn 0.18" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sebastian Raschka 2016-09-30 \n", "\n", "CPython 3.5.2\n", "IPython 5.1.0\n", "\n", "sklearn 0.18\n", "mlxtend 0.4.3dev0\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "There were a lot of neat changes introduced in [scikit-learn 0.18](http://scikit-learn.org/dev/whats_new.html), released on 28 Sep, 2016, that make nested CV a lot more convenient. 
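\n", "\n", "In particular, the new `model_selection` API lets us pass a `GridSearchCV` object directly to `cross_val_score`, so that each outer fold runs a complete inner grid search on its training portion. As a minimal sketch (the `gcv`, `X_train`, `y_train`, and `outer_cv` objects are defined in the cells below):\n", "\n", "```python\n", "nested_score = cross_val_score(gcv, X=X_train, y=y_train, cv=outer_cv)\n", "```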
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from mlxtend.data import mnist_data\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Loading and splitting the dataset\n", "# Note that this is a small (stratified) subset\n", "# of MNIST; it consists of 5000 samples only, that is,\n", "# 10% of the original MNIST dataset\n", "# http://yann.lecun.com/exdb/mnist/\n", "X, y = mnist_data()\n", "X = X.astype(np.float32)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", " train_size=0.8,\n", " random_state=1,\n", " stratify=y)\n", "\n", "# Initializing Classifiers\n", "clf1 = LogisticRegression(multi_class='multinomial',\n", " solver='newton-cg',\n", " random_state=1)\n", "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", " leaf_size=50)\n", "clf3 = DecisionTreeClassifier(random_state=1)\n", "clf4 = SVC(random_state=1)\n", "\n", "# Building the pipelines\n", "pipe1 = Pipeline([('std', StandardScaler()),\n", " ('clf1', clf1)])\n", "\n", "pipe2 = Pipeline([('std', StandardScaler()),\n", " ('clf2', clf2)])\n", "\n", "pipe4 = Pipeline([('std', StandardScaler()),\n", " ('clf4', clf4)])\n", "\n", "\n", "# Setting up the parameter grids\n", "param_grid1 = [{'clf1__penalty': ['l2'],\n", " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", " 'clf2__p': [1, 2]}]\n", "\n", "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", " 'criterion': ['gini', 'entropy']}]\n", "\n", "param_grid4 = [{'clf4__kernel': ['rbf'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", " {'clf4__kernel': ['linear'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", "gridcvs = {}\n", "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", "\n", "for pgrid, est, name in zip((param_grid1, param_grid2,\n", " param_grid3, param_grid4),\n", " (pipe1, pipe2, clf3, pipe4),\n", " ('Softmax', 'KNN', 'DTree', 'SVM')):\n", " gcv = GridSearchCV(estimator=est,\n", " param_grid=pgrid,\n", " scoring='accuracy',\n", " n_jobs=1,\n", " cv=inner_cv,\n", " verbose=0,\n", " refit=True)\n", " gridcvs[name] = gcv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DTree | outer ACC 77.33% +/- 2.72\n", "KNN | outer ACC 91.10% +/- 0.96\n", "SVM | outer ACC 91.95% +/- 1.04\n", "Softmax | outer ACC 90.32% +/- 1.22\n" ] } ], "source": [ "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", "\n", "for name, gs_est in sorted(gridcvs.items()):\n", " nested_score = cross_val_score(gs_est, \n", " X=X_train, \n", " y=y_train, \n", " cv=outer_cv,\n", " n_jobs=1)\n", " print('%s | outer ACC %.2f%% +/- %.2f' % \n", " (name, nested_score.mean() * 100, nested_score.std() * 100))" ] }, 
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy 91.03% (average over CV test folds)\n", "Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n", "Training Accuracy: 99.92%\n", "Test Accuracy: 93.00%\n" ] } ], "source": [ "# Fitting a model to the whole training set\n", "# using the \"best\" algorithm\n", "best_algo = gridcvs['SVM']\n", "\n", "best_algo.fit(X_train, y_train)\n", "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n", "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n", "\n", "print('Accuracy %.2f%% (average over CV test folds)' %\n", " (100 * best_algo.best_score_))\n", "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n", "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", "print('Test Accuracy: %.2f%%' % (100 * test_acc))" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }