{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "\n",
      "Cross-validation\n",
      "===================\n",
      "What is cross-validation?\n",
      "--------------------------\n",
      "* A robust way to evaluate predictive accuracy.\n",
      "* Gives mean and standard deviation.\n",
      "* Makes good use of all the data."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import KFold\n",
      "n_samples = 200\n",
      "# 5 folds over n_samples samples\n",
      "cv = KFold(n=n_samples, n_folds=5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "# One figure per fold: training set drawn in blue, test set in red.\n",
      "for training_set, test_set in cv:\n",
      "    plt.figure(figsize=(20,1))\n",
      "    plt.plot(training_set, np.ones(len(training_set)), \"o\", color='blue', label=\"training set\")\n",
      "    plt.plot(test_set, np.ones(len(test_set)), \"o\", color='red', label=\"test set\")\n",
      "    plt.legend(loc=\"best\")\n",
      "    plt.axis(\"off\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Using cross-validation in scikit-learn\n",
      "----------------------------------------"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import cross_val_score, train_test_split"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.datasets import load_digits"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "digits = load_digits()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fixed seed so the train/test split is the same on every run.\n",
      "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.svm import SVC"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 3-fold cross-validation with the estimator's default score.\n",
      "cross_val_score(SVC(C=1), X_train, y_train, cv=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# NOTE(review): y_train is the multiclass digit label here, so the result depends on how the\n",
      "# installed scikit-learn averages 'f1' over classes - confirm against the pinned version.\n",
      "cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring=\"f1\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's go to a binary task for a moment (even vs. odd digits)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# y_train % 2 turns the digit label into a 0/1 target.\n",
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"average_precision\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"roc_auc\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "There are other ways to do cross-validation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import ShuffleSplit\n",
      "# 10 random splits, each holding out 40% for testing; seeded so the scores are repeatable.\n",
      "shuffle_cv = ShuffleSplit(len(X_train), 10, test_size=.4, random_state=0)\n",
      "cross_val_score(SVC(C=10), X_train, y_train, cv=shuffle_cv)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Tasks\n",
      "======\n",
      "1. Select a good ``gamma`` and ``C`` for SVC on ``digits`` using cross-validation.\n",
      "2. Validate your findings on the test set."
     ]
    }
   ],
   "metadata": {}
  }
 ]
}