{ "metadata": { "name": "Sklearn-rf-tutorial" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "#IPython notebook for tutorial on Scikit-learn random forests\n", "#Created by Ryan Feather Nov 17, 2012 (ryan.feather@gmail.com)\n", "#Downloadable from http://featherconflagration.com/tutorials\n", "#For more information on scikit-learn, visit http://scikit-learn.org/stable/" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "#set up some utility routines for this example. You can safely ignore these for now\n", "import numpy as np\n", "\n", "#Sample a percentage of indexes from y in a manner that preserves the ratio of classes\n", "def stratifiedSample(y,percent):\n", " labels = np.unique(y)\n", " sample = []\n", " for label in labels:\n", " #find the current label\n", " currentIndexes = np.nonzero(y==label)[0]\n", " #randomize the sample\n", " np.random.shuffle(currentIndexes)\n", " sampleSize = int(percent*len(currentIndexes))\n", " #extend the current sample list\n", " sample += currentIndexes[0:sampleSize].tolist()\n", " #convert the sample to boolean indexes for convenience\n", " booleanSample = np.zeros((len(y),),dtype=bool)\n", " booleanSample[sample] = True\n", " return booleanSample" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "#load digits sample data set\n", "from sklearn.datasets import load_digits\n", "digits = load_digits()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.data" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 5, "text": [ "array([[ 0., 0., 5., ..., 0., 0., 0.],\n", " [ 0., 0., 0., ..., 10., 0., 0.],\n", " [ 0., 0., 0., ..., 16., 9., 0.],\n", " ..., \n", " [ 0., 0., 1., ..., 6., 0., 0.],\n", " [ 0., 0., 2., ..., 12., 0., 0.],\n", " [ 0., 0., 10., ..., 12., 1., 0.]])" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.data.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 6, "text": [ "(1797, 64)" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.target" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 8, "text": [ "array([0, 1, 2, ..., 8, 9, 8])" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.target.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 9, "text": [ "(1797,)" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "#visualize a digit\n", "from matplotlib import pyplot\n", "pyplot.gray()\n", "pyplot.matshow(digits.images[39]) \n", "pyplot.show()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": true, "input": [ "#set up test and training sets\n", "fullData = digits.data\n", "fullLabels = digits.target" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "#sample 25% for a test set\n", "testIndexes = stratifiedSample(fullLabels,0.25)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": 
"code", "collapsed": false, "input": [ "testData = fullData[testIndexes,:]\n", "testLabels = fullLabels[testIndexes]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "testData.shape" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "trainData = fullData[~testIndexes,:]\n", "trainLabels = fullLabels[~testIndexes]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "trainData.shape" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "#load the random forest classifier library\n", "from sklearn.ensemble.forest import RandomForestClassifier\n", "#create a random forest\n", "forest = RandomForestClassifier()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "forest" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 18, "text": [ "RandomForestClassifier(bootstrap=True, compute_importances=False,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " min_density=0.1, min_samples_leaf=1, min_samples_split=1,\n", " n_estimators=10, n_jobs=1, oob_score=False,\n", " random_state=,\n", " verbose=0)" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "#train the random forest on the training set\n", "forest.fit(trainData,trainLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 19, "text": [ "RandomForestClassifier(bootstrap=True, compute_importances=False,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " min_density=0.1, min_samples_leaf=1, min_samples_split=1,\n", " n_estimators=10, n_jobs=1, oob_score=False,\n", " random_state=,\n", " verbose=0)" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "#test the forest on the test set\n", "predictedLabels = forest.predict(testData)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "predictedLabels" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 21, "text": [ "array([4, 0, 6, 7, 5, 4, 9, 5, 5, 5, 8, 4, 0, 4, 6, 6, 0, 9, 3, 8, 1, 2, 1,\n", " 7, 4, 6, 1, 4, 5, 3, 8, 4, 2, 8, 7, 8, 6, 5, 8, 8, 7, 2, 2, 3, 7, 4,\n", " 6, 9, 1, 1, 5, 6, 7, 2, 5, 4, 0, 3, 4, 5, 6, 0, 8, 4, 5, 9, 0, 9, 8,\n", " 4, 7, 2, 3, 3, 3, 6, 7, 5, 1, 7, 1, 7, 5, 3, 7, 6, 2, 9, 5, 9, 3, 3,\n", " 4, 5, 8, 0, 5, 6, 9, 0, 8, 8, 7, 5, 2, 7, 2, 0, 3, 6, 9, 2, 0, 3, 2,\n", " 4, 6, 1, 9, 1, 6, 9, 7, 4, 2, 4, 9, 0, 3, 7, 9, 2, 5, 8, 1, 3, 5, 7,\n", " 9, 6, 7, 8, 2, 7, 3, 3, 6, 4, 9, 2, 0, 1, 1, 7, 1, 7, 8, 6, 5, 1, 8,\n", " 9, 0, 8, 9, 2, 4, 0, 8, 4, 3, 2, 3, 6, 9, 0, 9, 5, 0, 0, 7, 7, 1, 1,\n", " 6, 8, 7, 4, 5, 4, 8, 8, 1, 2, 5, 8, 9, 2, 6, 0, 2, 5, 8, 4, 5, 2, 1,\n", " 7, 6, 4, 9, 7, 1, 3, 4, 0, 5, 7, 5, 4, 5, 0, 8, 4, 6, 7, 9, 6, 9, 1,\n", " 8, 0, 9, 5, 5, 9, 8, 1, 2, 2, 0, 9, 5, 0, 2, 8, 6, 0, 1, 3, 1, 5, 4,\n", " 7, 2, 9, 0, 8, 7, 8, 6, 1, 2, 9, 0, 3, 6, 6, 0, 2, 7, 2, 1, 3, 3, 1,\n", " 3, 7, 4, 7, 2, 9, 8, 4, 0, 8, 8, 1, 2, 9, 5, 2, 3, 5, 0, 7, 2, 2, 9,\n", " 0, 1, 6, 3, 6, 5, 5, 3, 3, 8, 3, 5, 6, 9, 1, 8, 0, 3, 4, 7, 1, 8, 2,\n", " 4, 7, 9, 6, 0, 8, 3, 1, 0, 2, 9, 6, 9, 5, 9, 3, 1, 5, 3, 1, 4, 8, 2,\n", " 4, 0, 0, 3, 4, 5, 1, 0, 4, 3, 6, 7, 
0, 5, 5, 7, 1, 2, 6, 6, 9, 6, 1,\n", " 3, 7, 4, 4, 8, 8, 2, 3, 7, 0, 7, 3, 9, 5, 4, 1, 7, 8, 0, 1, 6, 3, 3,\n", " 4, 6, 1, 5, 7, 0, 2, 7, 9, 3, 1, 3, 9, 4, 1, 4, 7, 1, 4, 0, 0, 6, 7,\n", " 1, 2, 4, 6, 1, 3, 5, 4, 7, 1, 2, 2, 6, 9, 3, 6, 0, 0, 0, 7, 6, 6, 3,\n", " 1, 9, 1, 6, 1, 1, 0, 7])" ] } ], "prompt_number": 21 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#compute the raw accuracy (same as forest.score())\n", "print np.sum(predictedLabels==testLabels)/float(testLabels.shape[0])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.937078651685\n" ] } ], "prompt_number": 22 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "forest.score(testData,testLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 25, "text": [ "0.93707865168539328" ] } ], "prompt_number": 25 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#try to autoselect the best number of trees\n", "#use a stratified cv iterator so that every fold preserves the class ratios of the training set\n", "from sklearn.cross_validation import StratifiedKFold\n", "stratifiedIterator = StratifiedKFold(trainLabels,k=4)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "stratifiedIterator" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#set up a grid search to select the optimum number of trees\n", "from sklearn.grid_search import GridSearchCV\n", "params = {'n_estimators':range(10,101,10)}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#create a parallel grid search that will search for the best parameter set\n", "gridSearch = GridSearchCV(forest,param_grid=params,n_jobs=-1, cv=stratifiedIterator)" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#perform the search for the best parameters\n", "cvFitForest = gridSearch.fit(trainData,trainLabels)" ], "language": "python", "metadata": {}, "outputs": [] }, 
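{ "cell_type": "code", "collapsed": false, "input": [ "#A small optional check: fit() returns the GridSearchCV object itself, so cvFitForest and\n", "#gridSearch are the same fitted search; with the default refit=True it has already been\n", "#retrained on the full training set using the best parameters found (see best_estimator_)\n", "print cvFitForest is gridSearch\n", "print gridSearch.best_estimator_.n_estimators" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "print gridSearch.grid_scores_" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[({'n_estimators': 10}, 0.92577622265122272, array([ 0.95535714, 0.93693694, 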
0.93693694, 0.87387387])), ({'n_estimators': 20}, 0.95051077863577871, array([ 0.97321429, 0.95495495, 0.94594595, 0.92792793])), ({'n_estimators': 30}, 0.94825852638352637, array([ 0.97321429, 0.95495495, 0.93693694, 0.92792793])), ({'n_estimators': 40}, 0.95501528314028317, array([ 0.97321429, 0.96396396, 0.94594595, 0.93693694])), ({'n_estimators': 50}, 0.95499517374517373, array([ 0.98214286, 0.95495495, 0.94594595, 0.93693694])), ({'n_estimators': 60}, 0.95726753539253551, array([ 0.97321429, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 70}, 0.95503539253539249, array([ 0.96428571, 0.95495495, 0.95495495, 0.94594595])), ({'n_estimators': 80}, 0.95278314028314037, array([ 0.96428571, 0.95495495, 0.94594595, 0.94594595])), ({'n_estimators': 90}, 0.9550353925353926, array([ 0.96428571, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 100}, 0.9550353925353926, array([ 0.96428571, 0.95495495, 0.94594595, 0.95495495]))]\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "#take a look at the cv scores for one parameter set\n", "gridSearch.grid_scores_[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 35, "text": [ "({'n_estimators': 10},\n", " 0.92577622265122272,\n", " array([ 0.95535714, 0.93693694, 0.93693694, 0.87387387]))" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "#extract interesting data\n", "estimator = [gridScore[0]['n_estimators'] for gridScore in gridSearch.grid_scores_]\n", "score = [gridScore[1] for gridScore in gridSearch.grid_scores_]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "estimator" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 37, "text": [ "[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "score" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 38, "text": [ "[0.92577622265122272,\n", " 0.95051077863577871,\n", " 0.94825852638352637,\n", " 0.95501528314028317,\n", " 0.95499517374517373,\n", " 0.95726753539253551,\n", " 0.95503539253539249,\n", " 0.95278314028314037,\n", " 0.9550353925353926,\n", " 0.9550353925353926]" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "#plot the cv score with respect to parameters\n", "pyplot.plot(estimator,score)\n", "pyplot.show()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "#compare to score on the test set\n", "print cvFitForest.score(testData,testLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.961797752809\n" ] } ], "prompt_number": 40 } ], "metadata": {} } ] }