{
 "metadata": { "name": "quiz" },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Quiz\n",
      "\n",
      "Two exercises on datasets that ship with R packages, loaded into pandas through `pandas.rpy`:\n",
      "\n",
      "1. `SAheart` (R package `ElemStatLearn`): a binomial GLM for `chd`, scored by misclassification rate on a train/test split.\n",
      "2. `olive` (R package `pgmm`): a decision tree for `Area`, evaluated at the column means of the predictors.\n",
      "\n",
      "Running the notebook needs an R installation that provides both packages."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas.rpy.common as com"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "SAheart = com.load_data('SAheart', package='ElemStatLearn')\n",
      "SAheart.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [ { "html": [ "
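<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
 "<table border=\"1\" class=\"dataframe\">\n",
 "  <thead>\n",
 "    <tr style=\"text-align: right;\">\n",
 "      <th></th>\n",
 "      <th>sbp</th>\n",
 "      <th>tobacco</th>\n",
 "      <th>ldl</th>\n",
 "      <th>adiposity</th>\n",
 "      <th>famhist</th>\n",
 "      <th>typea</th>\n",
 "      <th>obesity</th>\n",
 "      <th>alcohol</th>\n",
 "      <th>age</th>\n",
 "      <th>chd</th>\n",
 "    </tr>\n",
 "  </thead>\n",
 "  <tbody>\n",
 "    <tr>\n",
 "      <th>1</th>\n",
 "      <td>160</td>\n", "      <td>12.00</td>\n", "      <td>5.73</td>\n", "      <td>23.11</td>\n", "      <td>Present</td>\n",
 "      <td>49</td>\n", "      <td>25.30</td>\n", "      <td>97.20</td>\n", "      <td>52</td>\n", "      <td>1</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>2</th>\n",
 "      <td>144</td>\n", "      <td>0.01</td>\n", "      <td>4.41</td>\n", "      <td>28.61</td>\n", "      <td>Absent</td>\n",
 "      <td>55</td>\n", "      <td>28.87</td>\n", "      <td>2.06</td>\n", "      <td>63</td>\n", "      <td>1</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>3</th>\n",
 "      <td>118</td>\n", "      <td>0.08</td>\n", "      <td>3.48</td>\n", "      <td>32.28</td>\n", "      <td>Present</td>\n",
 "      <td>52</td>\n", "      <td>29.14</td>\n", "      <td>3.81</td>\n", "      <td>46</td>\n", "      <td>0</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>4</th>\n",
 "      <td>170</td>\n", "      <td>7.50</td>\n", "      <td>6.41</td>\n", "      <td>38.03</td>\n", "      <td>Present</td>\n",
 "      <td>51</td>\n", "      <td>31.99</td>\n", "      <td>24.26</td>\n", "      <td>58</td>\n", "      <td>1</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>5</th>\n",
 "      <td>134</td>\n", "      <td>13.60</td>\n", "      <td>3.50</td>\n", "      <td>27.78</td>\n", "      <td>Present</td>\n",
 "      <td>60</td>\n", "      <td>25.99</td>\n", "      <td>57.34</td>\n", "      <td>49</td>\n", "      <td>1</td>\n",
 "    </tr>\n",
 "  </tbody>\n",
 "</table>\n",
 "</div>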
" ],
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        " sbp tobacco ldl adiposity famhist typea obesity alcohol age chd\n",
        "1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1\n",
        "2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1\n",
        "3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0\n",
        "4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1\n",
        "5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## SAheart: train/test split\n",
      "\n",
      "Half of the rows are drawn in R (seed 8484) and exported to Python as `train`; the remaining rows form the test set."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%load_ext rmagic"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%R -o train\n",
      "set.seed(8484)\n",
      "train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# `train` holds the row labels drawn by R's sample(). A set makes each membership test O(1)\n",
      "# instead of scanning the whole array once per row of SAheart.\n",
      "train_labels = set(train)\n",
      "trainSA = SAheart.ix[train,:]\n",
      "test = [label for label in SAheart.index if label not in train_labels]\n",
      "testSA = SAheart.ix[test, :]\n",
      "# The two pieces must partition the data: no row lost, none counted twice.\n",
      "assert len(trainSA) + len(testSA) == len(SAheart), 'train/test split does not partition SAheart'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 37
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## SAheart: logistic regression and misclassification rate\n",
      "\n",
      "A binomial GLM for `chd` is fitted on the training half; predictions above 0.5 are classified as 1."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from statsmodels.formula.api import glm\n",
      "from statsmodels.api import families as f"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 38
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lm = glm('chd ~ age + alcohol + obesity + tobacco + typea + ldl', trainSA, family=f.Binomial()).fit()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 39
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def missClass(values, prediction):\n",
      "    \"\"\"Return the fraction of observations whose thresholded prediction differs from `values`.\n",
      "\n",
      "    values     -- observed outcomes, compared element-wise with the predicted class\n",
      "    prediction -- predicted scores; entries above 0.5 are classified as 1, all others as 0\n",
      "    \"\"\"\n",
      "    predicted_class = (prediction > 0.5) * 1\n",
      "    return float(sum(predicted_class != values)) / float(len(values))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 40
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print 'training set misclassification %.4f' % missClass(trainSA['chd'], lm.fittedvalues)\n",
      "print 'test set misclassification %.4f' % missClass(testSA['chd'], lm.predict(testSA.ix[:,:-1]))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "training set misclassification 0.2597\n",
        "test set misclassification 0.3203\n"
       ]
      }
     ],
     "prompt_number": 52
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## olive: load the data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "olive = com.load_data('olive', package='pgmm')\n",
      "# NOTE(review): .ix[:,:-1] drops the LAST column. If this was meant to mirror R's `olive[,-1]`\n",
      "# (which drops the FIRST column) it should be `.ix[:,1:]` -- confirm before relying on the results.\n",
      "olive = olive.ix[:,:-1]\n",
      "olive.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [ { "html": [ "
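<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
 "<table border=\"1\" class=\"dataframe\">\n",
 "  <thead>\n",
 "    <tr style=\"text-align: right;\">\n",
 "      <th></th>\n",
 "      <th>Region</th>\n",
 "      <th>Area</th>\n",
 "      <th>Palmitic</th>\n",
 "      <th>Palmitoleic</th>\n",
 "      <th>Stearic</th>\n",
 "      <th>Oleic</th>\n",
 "      <th>Linoleic</th>\n",
 "      <th>Linolenic</th>\n",
 "      <th>Arachidic</th>\n",
 "    </tr>\n",
 "  </thead>\n",
 "  <tbody>\n",
 "    <tr>\n",
 "      <th>1</th>\n",
 "      <td>1</td>\n", "      <td>1</td>\n", "      <td>1075</td>\n", "      <td>75</td>\n", "      <td>226</td>\n",
 "      <td>7823</td>\n", "      <td>672</td>\n", "      <td>36</td>\n", "      <td>60</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>2</th>\n",
 "      <td>1</td>\n", "      <td>1</td>\n", "      <td>1088</td>\n", "      <td>73</td>\n", "      <td>224</td>\n",
 "      <td>7709</td>\n", "      <td>781</td>\n", "      <td>31</td>\n", "      <td>61</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>3</th>\n",
 "      <td>1</td>\n", "      <td>1</td>\n", "      <td>911</td>\n", "      <td>54</td>\n", "      <td>246</td>\n",
 "      <td>8113</td>\n", "      <td>549</td>\n", "      <td>31</td>\n", "      <td>63</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>4</th>\n",
 "      <td>1</td>\n", "      <td>1</td>\n", "      <td>966</td>\n", "      <td>57</td>\n", "      <td>240</td>\n",
 "      <td>7952</td>\n", "      <td>619</td>\n", "      <td>50</td>\n", "      <td>78</td>\n",
 "    </tr>\n",
 "    <tr>\n",
 "      <th>5</th>\n",
 "      <td>1</td>\n", "      <td>1</td>\n", "      <td>1051</td>\n", "      <td>67</td>\n", "      <td>259</td>\n",
 "      <td>7771</td>\n", "      <td>672</td>\n", "      <td>50</td>\n", "      <td>80</td>\n",
 "    </tr>\n",
 "  </tbody>\n",
 "</table>\n",
 "</div>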
" ],
       "output_type": "pyout",
       "prompt_number": 56,
       "text": [
        " Region Area Palmitic Palmitoleic Stearic Oleic Linoleic Linolenic Arachidic\n",
        "1 1 1 1075 75 226 7823 672 36 60\n",
        "2 1 1 1088 73 224 7709 781 31 61\n",
        "3 1 1 911 54 246 8113 549 31 63\n",
        "4 1 1 966 57 240 7952 619 50 78\n",
        "5 1 1 1051 67 259 7771 672 50 80"
       ]
      }
     ],
     "prompt_number": 56
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## olive: decision tree for `Area`\n",
      "\n",
      "The tree is fitted on all remaining columns, exported to `tree3.png`, and then asked for class probabilities at the column means of the predictors."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn import tree\n",
      "import numpy as np\n",
      "import patsy as pt\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 60
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "y, X = pt.dmatrices('Area ~ Region + Palmitic + Palmitoleic + Stearic + \\\n",
      "                     Oleic + Linoleic + Linolenic + Arachidic - 1', olive)\n",
      "\n",
      "# patsy returns y as an (n, 1) design matrix; scikit-learn expects a 1-D target, so flatten it.\n",
      "# random_state pins the choice between equally good splits so reruns produce the same tree.\n",
      "clf = tree.DecisionTreeClassifier(random_state=0).fit(X, np.asarray(y).ravel())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 96
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import StringIO, pydot\n",
      "from IPython.core.display import HTML\n",
      "\n",
      "dot_data = StringIO.StringIO()\n",
      "tree.export_graphviz(clf, out_file=dot_data)\n",
      "graph = pydot.graph_from_dot_data(dot_data.getvalue())\n",
      "graph.write_png('tree3.png')\n",
      "#HTML('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 97,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 97
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "olive.mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 104,
       "text": [
        "Region 1.699301\n",
        "Area 4.599650\n",
        "Palmitic 1231.741259\n",
        "Palmitoleic 126.094406\n",
        "Stearic 228.865385\n",
        "Oleic 7311.748252\n",
        "Linoleic 980.527972\n",
        "Linolenic 31.888112\n",
        "Arachidic 58.097902"
       ]
      }
     ],
     "prompt_number": 104
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "newdata = olive[['Region', 'Palmitic', 'Palmitoleic', 'Stearic', 'Oleic', 'Linoleic', 'Linolenic', 'Arachidic']].mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 111
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# predict_proba expects a 2-D (n_samples, n_features) array; newdata is a 1-D Series of column\n",
      "# means, so present it explicitly as a single row.\n",
      "clf.predict_proba(newdata.values.reshape(1, -1))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 112,
       "text": [
        "array([[ 0., 1., 0., 0., 0., 0., 0., 0., 0.]])"
       ]
      }
     ],
     "prompt_number": 112
    }
   ],
   "metadata": {}
  }
 ]
}