{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**This Jupyter notebook contains the complimentary code for the Appendix section of the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# A \"nested cross-validation for algorithm selection\" example using scikit-learn" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sebastian Raschka 2016-09-30 \n", "\n", "CPython 3.5.2\n", "IPython 5.1.0\n", "\n", "sklearn 0.17.1\n", "mlxtend 0.4.2\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from mlxtend.data import mnist_data\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.cross_validation import StratifiedKFold\n", "from sklearn.metrics import accuracy_score\n", "import random\n", "\n", "np.random.seed(1)\n", "random.seed(1)\n", "\n", "# Loading and splitting the dataset\n", "# Note that this is a small (stratified) subset\n", "# of MNIST; it consists of 5000 samples only, that is,\n", "# 10% of the original MNIST dataset\n", "# http://yann.lecun.com/exdb/mnist/\n", "X, y = mnist_data()\n", "X = X.astype(np.float32)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", " train_size=0.8,\n", " random_state=1,\n", " stratify=y)\n", "\n", "# Initializing Classifiers\n", "clf1 = LogisticRegression(multi_class='multinomial',\n", " solver='newton-cg',\n", " random_state=1)\n", "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", " leaf_size=50)\n", "clf3 = DecisionTreeClassifier(random_state=1)\n", "clf4 = SVC(random_state=1)\n", "\n", "# Building the pipelines\n", "pipe1 = Pipeline([('std', StandardScaler()),\n", " ('clf1', clf1)])\n", "\n", "pipe2 = Pipeline([('std', StandardScaler()),\n", " ('clf2', clf2)])\n", "\n", "pipe4 = Pipeline([('std', StandardScaler()),\n", " ('clf4', clf4)])\n", "\n", "\n", "# Setting up the parameter grids\n", "param_grid1 = [{'clf1__penalty': ['l2'],\n", " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", " 'clf2__p': [1, 2]}]\n", "\n", "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", " 'criterion': ['gini', 'entropy']}]\n", "\n", "param_grid4 = [{'clf4__kernel': ['rbf'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", " {'clf4__kernel': ['linear'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "\n", "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", "gridcvs = {}\n", "\n", "for pgrid, est, name in zip((param_grid1, param_grid2,\n", " param_grid3, param_grid4),\n", " (pipe1, pipe2, clf3, pipe4),\n", " ('Softmax', 'KNN', 'DTree', 'SVM')):\n", " gcv = GridSearchCV(estimator=est,\n", " 
param_grid=pgrid,\n", "                       scoring='accuracy',\n", "                       n_jobs=1,\n", "                       cv=2,\n", "                       verbose=0,\n", "                       refit=True)\n", "    gridcvs[name] = gcv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "outer fold 1/5 | tuning DTree    | inner ACC 72.38% | outer ACC 81.25%\n", "outer fold 1/5 | tuning KNN      | inner ACC 88.19% | outer ACC 90.62%\n", "outer fold 1/5 | tuning SVM      | inner ACC 89.88% | outer ACC 92.62%\n", "outer fold 1/5 | tuning Softmax  | inner ACC 88.22% | outer ACC 91.88%\n", "outer fold 2/5 | tuning DTree    | inner ACC 75.16% | outer ACC 76.25%\n", "outer fold 2/5 | tuning KNN      | inner ACC 88.62% | outer ACC 90.62%\n", "outer fold 2/5 | tuning SVM      | inner ACC 90.84% | outer ACC 91.25%\n", "outer fold 2/5 | tuning Softmax  | inner ACC 89.00% | outer ACC 90.62%\n", "outer fold 3/5 | tuning DTree    | inner ACC 74.25% | outer ACC 78.75%\n", "outer fold 3/5 | tuning KNN      | inner ACC 87.81% | outer ACC 93.00%\n", "outer fold 3/5 | tuning SVM      | inner ACC 89.69% | outer ACC 92.12%\n", "outer fold 3/5 | tuning Softmax  | inner ACC 89.03% | outer ACC 90.38%\n", "outer fold 4/5 | tuning DTree    | inner ACC 75.03% | outer ACC 73.62%\n", "outer fold 4/5 | tuning KNN      | inner ACC 88.88% | outer ACC 90.50%\n", "outer fold 4/5 | tuning SVM      | inner ACC 90.78% | outer ACC 90.38%\n", "outer fold 4/5 | tuning Softmax  | inner ACC 89.25% | outer ACC 86.50%\n", "outer fold 5/5 | tuning DTree    | inner ACC 73.31% | outer ACC 76.25%\n", "outer fold 5/5 | tuning KNN      | inner ACC 88.41% | outer ACC 90.88%\n", "outer fold 5/5 | tuning SVM      | inner ACC 90.28% | outer ACC 93.00%\n", "outer fold 5/5 | tuning Softmax  | inner ACC 88.16% | outer ACC 90.62%\n" ] } ], "source": [ "cv_scores = {name: [] for name, gs_est in gridcvs.items()}\n", "\n", "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)\n", "\n", "# The outer loop for algorithm selection\n", "c = 1\n", "for outer_train_idx, outer_valid_idx in skfold:\n", "    for name, gs_est in sorted(gridcvs.items()):\n", "        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')\n", "\n", "        # The inner loop for hyperparameter tuning\n", "        gs_est.fit(X_train[outer_train_idx], y_train[outer_train_idx])\n", "        y_pred = gs_est.predict(X_train[outer_valid_idx])\n", "        acc = accuracy_score(y_true=y_train[outer_valid_idx], y_pred=y_pred)\n", "        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %\n", "              (gs_est.best_score_ * 100, acc * 100))\n", "        cv_scores[name].append(acc)\n", "\n", "    c += 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DTree    | outer CV acc. 77.22% +/- 2.584\n", "KNN      | outer CV acc. 91.13% +/- 0.945\n", "Softmax  | outer CV acc. 90.00% +/- 1.827\n", "SVM      | outer CV acc. 91.88% +/- 0.952\n", "\n", "SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n" ] } ], "source": [ "# Looking at the results\n", "for name in cv_scores:\n", "    print('%-8s | outer CV acc. 
%.2f%% +/- %.3f' % (\n", "          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))\n", "print('\\nSVM Best parameters', gridcvs['SVM'].best_params_)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy 90.80% (average over CV test folds)\n", "Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n", "Training Accuracy: 99.92%\n", "Test Accuracy: 93.00%\n" ] } ], "source": [ "# Fitting a model to the whole training set\n", "# using the \"best\" algorithm\n", "best_algo = gridcvs['SVM']\n", "\n", "best_algo.fit(X_train, y_train)\n", "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n", "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n", "\n", "print('Accuracy %.2f%% (average over CV test folds)' %\n", "      (100 * best_algo.best_score_))\n", "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n", "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", "print('Test Accuracy: %.2f%%' % (100 * test_acc))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fitting a model to the whole dataset\n", "# using the \"best\" algorithm and hyperparameter settings\n", "best_clf = best_algo.best_estimator_\n", "final_model = best_clf.fit(X, y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Nested CV for algorithm selection in scikit-learn 0.18" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sebastian Raschka 2016-09-30 \n", "\n", "CPython 3.5.2\n", "IPython 5.1.0\n", "\n", "sklearn 0.18\n", "mlxtend 0.4.3dev0\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "There were a lot of neat changes introduced in [scikit-learn 0.18](http://scikit-learn.org/dev/whats_new.html), released on 28 Sep, 2016, that make nested CV a lot more convenient. 
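\n", "\n", "In particular, the new `model_selection` API lets us pass a `GridSearchCV` object directly to `cross_val_score`, so that each outer fold runs a complete inner grid search on its training portion. As a minimal sketch (the `gcv`, `X_train`, `y_train`, and `outer_cv` objects are defined in the cells below):\n", "\n", "```python\n", "nested_score = cross_val_score(gcv, X=X_train, y=y_train, cv=outer_cv)\n", "```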
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import SVC\n", "from mlxtend.data import mnist_data\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Loading and splitting the dataset\n", "# Note that this is a small (stratified) subset\n", "# of MNIST; it consists of 5000 samples only, that is,\n", "# 10% of the original MNIST dataset\n", "# http://yann.lecun.com/exdb/mnist/\n", "X, y = mnist_data()\n", "X = X.astype(np.float32)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", " train_size=0.8,\n", " random_state=1,\n", " stratify=y)\n", "\n", "# Initializing Classifiers\n", "clf1 = LogisticRegression(multi_class='multinomial',\n", " solver='newton-cg',\n", " random_state=1)\n", "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", " leaf_size=50)\n", "clf3 = DecisionTreeClassifier(random_state=1)\n", "clf4 = SVC(random_state=1)\n", "\n", "# Building the pipelines\n", "pipe1 = Pipeline([('std', StandardScaler()),\n", " ('clf1', clf1)])\n", "\n", "pipe2 = Pipeline([('std', StandardScaler()),\n", " ('clf2', clf2)])\n", "\n", "pipe4 = Pipeline([('std', StandardScaler()),\n", " ('clf4', clf4)])\n", "\n", "\n", "# Setting up the parameter grids\n", "param_grid1 = [{'clf1__penalty': ['l2'],\n", " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", " 'clf2__p': [1, 2]}]\n", "\n", "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", " 'criterion': ['gini', 'entropy']}]\n", "\n", "param_grid4 = [{'clf4__kernel': ['rbf'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", " {'clf4__kernel': ['linear'],\n", " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", "\n", "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", "gridcvs = {}\n", "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", "\n", "for pgrid, est, name in zip((param_grid1, param_grid2,\n", " param_grid3, param_grid4),\n", " (pipe1, pipe2, clf3, pipe4),\n", " ('Softmax', 'KNN', 'DTree', 'SVM')):\n", " gcv = GridSearchCV(estimator=est,\n", " param_grid=pgrid,\n", " scoring='accuracy',\n", " n_jobs=1,\n", " cv=inner_cv,\n", " verbose=0,\n", " refit=True)\n", " gridcvs[name] = gcv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DTree | outer ACC 77.33% +/- 2.72\n", "KNN | outer ACC 91.10% +/- 0.96\n", "SVM | outer ACC 91.95% +/- 1.04\n", "Softmax | outer ACC 90.32% +/- 1.22\n" ] } ], "source": [ "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", "\n", "for name, gs_est in sorted(gridcvs.items()):\n", " nested_score = cross_val_score(gs_est, \n", " X=X_train, \n", " y=y_train, \n", " cv=outer_cv,\n", " n_jobs=1)\n", " print('%s | outer ACC %.2f%% +/- %.2f' % \n", " (name, nested_score.mean() * 100, nested_score.std() * 100))" ] }, 
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy 91.03% (average over CV test folds)\n", "Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n", "Training Accuracy: 99.92%\n", "Test Accuracy: 93.00%\n" ] } ], "source": [ "# Fitting a model to the whole training set\n", "# using the \"best\" algorithm\n", "best_algo = gridcvs['SVM']\n", "\n", "best_algo.fit(X_train, y_train)\n", "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n", "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n", "\n", "print('Accuracy %.2f%% (average over CV test folds)' %\n", " (100 * best_algo.best_score_))\n", "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n", "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", "print('Test Accuracy: %.2f%%' % (100 * test_acc))" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }