{ "metadata": { "name": "Sklearn-rf-tutorial" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "#IPython notebook for tutorial on Scikit-learn random forests\n", "#Created by Ryan Feather Nov 17, 2012 (ryan.feather@gmail.com)\n", "#Downloadable from http://featherconflagration.com/tutorials\n", "#For more information on scikit-learn, visit http://scikit-learn.org/stable/" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "#set up some utility routines for this example. You can safely ignore these for now\n", "import numpy as np\n", "\n", "#Sample a percentage of indexes from y in a manner that preserves the ratio of classes\n", "def stratifiedSample(y,percent):\n", " labels = np.unique(y)\n", " sample = []\n", " for label in labels:\n", " #find the current label\n", " currentIndexes = np.nonzero(y==label)[0]\n", " #randomize the sample\n", " np.random.shuffle(currentIndexes)\n", " sampleSize = int(percent*len(currentIndexes))\n", " #extend the current sample list\n", " sample += currentIndexes[0:sampleSize].tolist()\n", " #convert the sample to boolean indexes for convenience\n", " booleanSample = np.zeros((len(y),),dtype=bool)\n", " booleanSample[sample] = True\n", " return booleanSample" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "#load digits sample data set\n", "from sklearn.datasets import load_digits\n", "digits = load_digits()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.data" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 5, "text": [ "array([[ 0., 0., 5., ..., 0., 0., 0.],\n", " [ 0., 0., 0., ..., 10., 0., 0.],\n", " [ 0., 0., 0., ..., 16., 9., 0.],\n", " ..., \n", " [ 0., 0., 1., ..., 6., 0., 0.],\n", " [ 0., 0., 2., ..., 12., 0., 0.],\n", " [ 0., 0., 10., ..., 12., 1., 0.]])" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.data.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 6, "text": [ "(1797, 64)" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.target" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 8, "text": [ "array([0, 1, 2, ..., 8, 9, 8])" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.target.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 9, "text": [ "(1797,)" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "#visualize a digit\n", "from matplotlib import pyplot\n", "pyplot.gray()\n", "pyplot.matshow(digits.images[39]) \n", "pyplot.show()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": true, "input": [ "#set up test and training sets\n", "fullData = digits.data\n", "fullLabels = digits.target" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "#sample 25% for a test set\n", "testIndexes = stratifiedSample(fullLabels,0.25)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": 
"code", "collapsed": false, "input": [ "testData = fullData[testIndexes,:]\n", "testLabels = fullLabels[testIndexes]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "testData.shape" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "trainData = fullData[~testIndexes,:]\n", "trainLabels = fullLabels[~testIndexes]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "trainData.shape" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "#load the random forest classifier library\n", "from sklearn.ensemble.forest import RandomForestClassifier\n", "#create a random forest\n", "forest = RandomForestClassifier()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "forest" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 18, "text": [ "RandomForestClassifier(bootstrap=True, compute_importances=False,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " min_density=0.1, min_samples_leaf=1, min_samples_split=1,\n", " n_estimators=10, n_jobs=1, oob_score=False,\n", " random_state=,\n", " verbose=0)" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "#train the random forest on the training set\n", "forest.fit(trainData,trainLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 19, "text": [ "RandomForestClassifier(bootstrap=True, compute_importances=False,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " min_density=0.1, min_samples_leaf=1, min_samples_split=1,\n", " n_estimators=10, n_jobs=1, oob_score=False,\n", " random_state=,\n", " verbose=0)" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "#test the forest on the test set\n", "predictedLabels = forest.predict(testData)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "predictedLabels" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 21, "text": [ "array([4, 0, 6, 7, 5, 4, 9, 5, 5, 5, 8, 4, 0, 4, 6, 6, 0, 9, 3, 8, 1, 2, 1,\n", " 7, 4, 6, 1, 4, 5, 3, 8, 4, 2, 8, 7, 8, 6, 5, 8, 8, 7, 2, 2, 3, 7, 4,\n", " 6, 9, 1, 1, 5, 6, 7, 2, 5, 4, 0, 3, 4, 5, 6, 0, 8, 4, 5, 9, 0, 9, 8,\n", " 4, 7, 2, 3, 3, 3, 6, 7, 5, 1, 7, 1, 7, 5, 3, 7, 6, 2, 9, 5, 9, 3, 3,\n", " 4, 5, 8, 0, 5, 6, 9, 0, 8, 8, 7, 5, 2, 7, 2, 0, 3, 6, 9, 2, 0, 3, 2,\n", " 4, 6, 1, 9, 1, 6, 9, 7, 4, 2, 4, 9, 0, 3, 7, 9, 2, 5, 8, 1, 3, 5, 7,\n", " 9, 6, 7, 8, 2, 7, 3, 3, 6, 4, 9, 2, 0, 1, 1, 7, 1, 7, 8, 6, 5, 1, 8,\n", " 9, 0, 8, 9, 2, 4, 0, 8, 4, 3, 2, 3, 6, 9, 0, 9, 5, 0, 0, 7, 7, 1, 1,\n", " 6, 8, 7, 4, 5, 4, 8, 8, 1, 2, 5, 8, 9, 2, 6, 0, 2, 5, 8, 4, 5, 2, 1,\n", " 7, 6, 4, 9, 7, 1, 3, 4, 0, 5, 7, 5, 4, 5, 0, 8, 4, 6, 7, 9, 6, 9, 1,\n", " 8, 0, 9, 5, 5, 9, 8, 1, 2, 2, 0, 9, 5, 0, 2, 8, 6, 0, 1, 3, 1, 5, 4,\n", " 7, 2, 9, 0, 8, 7, 8, 6, 1, 2, 9, 0, 3, 6, 6, 0, 2, 7, 2, 1, 3, 3, 1,\n", " 3, 7, 4, 7, 2, 9, 8, 4, 0, 8, 8, 1, 2, 9, 5, 2, 3, 5, 0, 7, 2, 2, 9,\n", " 0, 1, 6, 3, 6, 5, 5, 3, 3, 8, 3, 5, 6, 9, 1, 8, 0, 3, 4, 7, 1, 8, 2,\n", " 4, 7, 9, 6, 0, 8, 3, 1, 0, 2, 9, 6, 9, 5, 9, 3, 1, 5, 3, 1, 4, 8, 2,\n", " 4, 0, 0, 3, 4, 5, 1, 0, 4, 3, 6, 7, 
0, 5, 5, 7, 1, 2, 6, 6, 9, 6, 1,\n", " 3, 7, 4, 4, 8, 8, 2, 3, 7, 0, 7, 3, 9, 5, 4, 1, 7, 8, 0, 1, 6, 3, 3,\n", " 4, 6, 1, 5, 7, 0, 2, 7, 9, 3, 1, 3, 9, 4, 1, 4, 7, 1, 4, 0, 0, 6, 7,\n", " 1, 2, 4, 6, 1, 3, 5, 4, 7, 1, 2, 2, 6, 9, 3, 6, 0, 0, 0, 7, 6, 6, 3,\n", " 1, 9, 1, 6, 1, 1, 0, 7])" ] } ], "prompt_number": 21 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#compute the raw accuracy (same as forest.score())\n", "print np.sum(predictedLabels==testLabels)/float(testLabels.shape[0])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.937078651685\n" ] } ], "prompt_number": 22 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "forest.score(testData,testLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 25, "text": [ "0.93707865168539328" ] } ], "prompt_number": 25 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#try to autoselect the best number of trees\n", "#use a stratified cv iterator so that every fold preserves the class ratios of the training set\n", "from sklearn.cross_validation import StratifiedKFold\n", "stratifiedIterator = StratifiedKFold(trainLabels,k=4)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "stratifiedIterator" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#set up a grid search to select the optimum number of trees\n", "from sklearn.grid_search import GridSearchCV\n", "params = {'n_estimators':range(10,101,10)}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#create a parallel grid search that will search for the best parameter set\n", "gridSearch = GridSearchCV(forest,param_grid=params,n_jobs=-1, cv=stratifiedIterator)" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "#perform the search for the best parameters\n", "cvFitForest = gridSearch.fit(trainData,trainLabels)" ], "language": "python", "metadata": {}, "outputs": [] }, 
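{ "cell_type": "code", "collapsed": false, "input": [ "#A small optional check: fit() returns the GridSearchCV object itself, so cvFitForest and\n", "#gridSearch are the same fitted search; with the default refit=True it has already been\n", "#retrained on the full training set using the best parameters found (see best_estimator_)\n", "print cvFitForest is gridSearch\n", "print gridSearch.best_estimator_.n_estimators" ], "language": "python", "metadata": {}, "outputs": [] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "print gridSearch.grid_scores_" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[({'n_estimators': 10}, 0.92577622265122272, array([ 0.95535714, 0.93693694, 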
0.93693694, 0.87387387])), ({'n_estimators': 20}, 0.95051077863577871, array([ 0.97321429, 0.95495495, 0.94594595, 0.92792793])), ({'n_estimators': 30}, 0.94825852638352637, array([ 0.97321429, 0.95495495, 0.93693694, 0.92792793])), ({'n_estimators': 40}, 0.95501528314028317, array([ 0.97321429, 0.96396396, 0.94594595, 0.93693694])), ({'n_estimators': 50}, 0.95499517374517373, array([ 0.98214286, 0.95495495, 0.94594595, 0.93693694])), ({'n_estimators': 60}, 0.95726753539253551, array([ 0.97321429, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 70}, 0.95503539253539249, array([ 0.96428571, 0.95495495, 0.95495495, 0.94594595])), ({'n_estimators': 80}, 0.95278314028314037, array([ 0.96428571, 0.95495495, 0.94594595, 0.94594595])), ({'n_estimators': 90}, 0.9550353925353926, array([ 0.96428571, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 100}, 0.9550353925353926, array([ 0.96428571, 0.95495495, 0.94594595, 0.95495495]))]\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "#take a look at the cv scores for one parameter set\n", "gridSearch.grid_scores_[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 35, "text": [ "({'n_estimators': 10},\n", " 0.92577622265122272,\n", " array([ 0.95535714, 0.93693694, 0.93693694, 0.87387387]))" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "#extract interesting data\n", "estimator = [gridScore[0]['n_estimators'] for gridScore in gridSearch.grid_scores_]\n", "score = [gridScore[1] for gridScore in gridSearch.grid_scores_]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "estimator" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 37, "text": [ "[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "score" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 38, "text": [ "[0.92577622265122272,\n", " 0.95051077863577871,\n", " 0.94825852638352637,\n", " 0.95501528314028317,\n", " 0.95499517374517373,\n", " 0.95726753539253551,\n", " 0.95503539253539249,\n", " 0.95278314028314037,\n", " 0.9550353925353926,\n", " 0.9550353925353926]" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "#plot the cv score with respect to parameters\n", "pyplot.plot(estimator,score)\n", "pyplot.show()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "#compare to score on the test set\n", "print cvFitForest.score(testData,testLabels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.961797752809\n" ] } ], "prompt_number": 40 } ], "metadata": {} } ] }