{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "using DataFrames, RDatasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
PharvisLnhhExpAgeSexMarriedEducIllnessInjuryIlldaysActdaysInsuranceCommune
102.7303633.7612male1210700192
202.7372482.944439female0010400167
302.2669352.56495male040000176
412.3927533.637586female1310301123
513.1053353.295837male13101000148
603.7608843.367296male190000120
703.1556093.663562female120000140
803.7246822.197225male050000157
922.8616913.7612female122040049
1032.6150774.234107male1010700170
1112.6532432.772589male041010040
1212.1398573.663562female1220500127
1322.6256833.555348female1310300106
1412.7677461.94591female0520300168
1502.8712422.302585male0000000185
1621.9838113.135494male1430100041
1701.2602013.091043male0400000106
1802.2981783.401197male1200000141
1932.1325083.332205male1430100061
2002.0002313.258096male1300000159
2102.0183753.218876male1610700121
2211.8861422.639057female021030056
23102.9531251.098612female022040034
2402.3784813.526361male1000000127
2501.5743763.555348male1200000135
2632.4955143.637586female1310500147
2712.3180773.7612male141050094
2802.0290454.248495female13103000125
2911.7887543.610918female133030079
3002.0911072.079442female0000000143
" ], "text/plain": [ "27765x12 DataFrame\n", "| Row | Pharvis | LnhhExp | Age | Sex | Married | Educ | Illness |\n", "|-------|---------|---------|---------|----------|---------|------|---------|\n", "| 1 | 0 | 2.73036 | 3.7612 | \"male\" | 1 | 2 | 1 |\n", "| 2 | 0 | 2.73725 | 2.94444 | \"female\" | 0 | 0 | 1 |\n", "| 3 | 0 | 2.26694 | 2.56495 | \"male\" | 0 | 4 | 0 |\n", "| 4 | 1 | 2.39275 | 3.63759 | \"female\" | 1 | 3 | 1 |\n", "| 5 | 1 | 3.10534 | 3.29584 | \"male\" | 1 | 3 | 1 |\n", "| 6 | 0 | 3.76088 | 3.3673 | \"male\" | 1 | 9 | 0 |\n", "| 7 | 0 | 3.15561 | 3.66356 | \"female\" | 1 | 2 | 0 |\n", "| 8 | 0 | 3.72468 | 2.19722 | \"male\" | 0 | 5 | 0 |\n", "| 9 | 2 | 2.86169 | 3.7612 | \"female\" | 1 | 2 | 2 |\n", "| 10 | 3 | 2.61508 | 4.23411 | \"male\" | 1 | 0 | 1 |\n", "| 11 | 1 | 2.65324 | 2.77259 | \"male\" | 0 | 4 | 1 |\n", "⋮\n", "| 27754 | 1 | 3.22879 | 3.98898 | \"female\" | 1 | 9 | 1 |\n", "| 27755 | 0 | 2.60798 | 3.82864 | \"male\" | 1 | 2 | 1 |\n", "| 27756 | 0 | 2.23453 | 3.97029 | \"female\" | 1 | 3 | 0 |\n", "| 27757 | 0 | 1.83282 | 3.61092 | \"male\" | 1 | 0 | 1 |\n", "| 27758 | 0 | 1.67896 | 4.15888 | \"male\" | 1 | 2 | 0 |\n", "| 27759 | 0 | 2.28975 | 2.89037 | \"male\" | 0 | 0 | 0 |\n", "| 27760 | 0 | 2.15735 | 4.00733 | \"female\" | 1 | 4 | 1 |\n", "| 27761 | 0 | 1.84729 | 1.60944 | \"female\" | 0 | 5 | 2 |\n", "| 27762 | 0 | 2.46146 | 2.83321 | \"female\" | 0 | 6 | 0 |\n", "| 27763 | 0 | 2.46026 | 2.56495 | \"female\" | 0 | 5 | 0 |\n", "| 27764 | 0 | 1.92017 | 4.00733 | \"female\" | 1 | 4 | 2 |\n", "| 27765 | 3 | 2.46883 | 3.13549 | \"male\" | 0 | 3 | 2 |\n", "\n", "| Row | Injury | Illdays | Actdays | Insurance | Commune |\n", "|-------|--------|---------|---------|-----------|---------|\n", "| 1 | 0 | 7 | 0 | 0 | 192 |\n", "| 2 | 0 | 4 | 0 | 0 | 167 |\n", "| 3 | 0 | 0 | 0 | 1 | 76 |\n", "| 4 | 0 | 3 | 0 | 1 | 123 |\n", "| 5 | 0 | 10 | 0 | 0 | 148 |\n", "| 6 | 0 | 0 | 0 | 1 | 20 |\n", "| 7 | 0 | 0 | 0 | 1 | 40 |\n", "| 8 | 0 | 0 | 0 | 1 | 
57 |\n", "| 9 | 0 | 4 | 0 | 0 | 49 |\n", "| 10 | 0 | 7 | 0 | 0 | 170 |\n", "| 11 | 0 | 1 | 0 | 0 | 40 |\n", "⋮\n", "| 27754 | 0 | 3 | 0 | 0 | 86 |\n", "| 27755 | 0 | 10 | 0 | 0 | 90 |\n", "| 27756 | 0 | 0 | 0 | 0 | 90 |\n", "| 27757 | 0 | 1 | 0 | 0 | 90 |\n", "| 27758 | 0 | 0 | 0 | 0 | 90 |\n", "| 27759 | 0 | 0 | 0 | 0 | 91 |\n", "| 27760 | 0 | 30 | 0 | 1 | 108 |\n", "| 27761 | 0 | 3 | 0 | 0 | 115 |\n", "| 27762 | 0 | 0 | 0 | 0 | 115 |\n", "| 27763 | 0 | 0 | 0 | 0 | 116 |\n", "| 27764 | 0 | 20 | 0 | 1 | 116 |\n", "| 27765 | 0 | 7 | 0 | 0 | 119 |" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vn = dataset(\"Ecdat\",\"VietNamI\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting \"Days Ill\" based on historical data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "27765-element DataArray{Int32,1}:\n", " 7\n", " 4\n", " 0\n", " 3\n", " 10\n", " 0\n", " 0\n", " 0\n", " 4\n", " 7\n", " 1\n", " 5\n", " 3\n", " ⋮\n", " 3\n", " 10\n", " 0\n", " 1\n", " 0\n", " 0\n", " 30\n", " 3\n", " 0\n", " 0\n", " 20\n", " 7" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "days_ill = vn[:Illdays]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### We've isolated the signal we want to analyze; now we remove it from the feature set" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
PharvisLnhhExpAgeSexMarriedEducIllnessInjuryActdaysInsuranceCommune
102.7303633.7612male121000192
202.7372482.944439female001000167
302.2669352.56495male04000176
412.3927533.637586female131001123
513.1053353.295837male131000148
603.7608843.367296male19000120
703.1556093.663562female12000140
803.7246822.197225male05000157
922.8616913.7612female12200049
1032.6150774.234107male101000170
1112.6532432.772589male04100040
1212.1398573.663562female122000127
1322.6256833.555348female131000106
1412.7677461.94591female052000168
1502.8712422.302585male000000185
1621.9838113.135494male14300041
1701.2602013.091043male040000106
1802.2981783.401197male120000141
1932.1325083.332205male14300061
2002.0002313.258096male130000159
2102.0183753.218876male161000121
2211.8861422.639057female02100056
23102.9531251.098612female02200034
2402.3784813.526361male100000127
2501.5743763.555348male120000135
2632.4955143.637586female131000147
2712.3180773.7612male14100094
2802.0290454.248495female131000125
2911.7887543.610918female13300079
3002.0911072.079442female000000143
" ], "text/plain": [ "27765x11 DataFrame\n", "| Row | Pharvis | LnhhExp | Age | Sex | Married | Educ | Illness |\n", "|-------|---------|---------|---------|----------|---------|------|---------|\n", "| 1 | 0 | 2.73036 | 3.7612 | \"male\" | 1 | 2 | 1 |\n", "| 2 | 0 | 2.73725 | 2.94444 | \"female\" | 0 | 0 | 1 |\n", "| 3 | 0 | 2.26694 | 2.56495 | \"male\" | 0 | 4 | 0 |\n", "| 4 | 1 | 2.39275 | 3.63759 | \"female\" | 1 | 3 | 1 |\n", "| 5 | 1 | 3.10534 | 3.29584 | \"male\" | 1 | 3 | 1 |\n", "| 6 | 0 | 3.76088 | 3.3673 | \"male\" | 1 | 9 | 0 |\n", "| 7 | 0 | 3.15561 | 3.66356 | \"female\" | 1 | 2 | 0 |\n", "| 8 | 0 | 3.72468 | 2.19722 | \"male\" | 0 | 5 | 0 |\n", "| 9 | 2 | 2.86169 | 3.7612 | \"female\" | 1 | 2 | 2 |\n", "| 10 | 3 | 2.61508 | 4.23411 | \"male\" | 1 | 0 | 1 |\n", "| 11 | 1 | 2.65324 | 2.77259 | \"male\" | 0 | 4 | 1 |\n", "⋮\n", "| 27754 | 1 | 3.22879 | 3.98898 | \"female\" | 1 | 9 | 1 |\n", "| 27755 | 0 | 2.60798 | 3.82864 | \"male\" | 1 | 2 | 1 |\n", "| 27756 | 0 | 2.23453 | 3.97029 | \"female\" | 1 | 3 | 0 |\n", "| 27757 | 0 | 1.83282 | 3.61092 | \"male\" | 1 | 0 | 1 |\n", "| 27758 | 0 | 1.67896 | 4.15888 | \"male\" | 1 | 2 | 0 |\n", "| 27759 | 0 | 2.28975 | 2.89037 | \"male\" | 0 | 0 | 0 |\n", "| 27760 | 0 | 2.15735 | 4.00733 | \"female\" | 1 | 4 | 1 |\n", "| 27761 | 0 | 1.84729 | 1.60944 | \"female\" | 0 | 5 | 2 |\n", "| 27762 | 0 | 2.46146 | 2.83321 | \"female\" | 0 | 6 | 0 |\n", "| 27763 | 0 | 2.46026 | 2.56495 | \"female\" | 0 | 5 | 0 |\n", "| 27764 | 0 | 1.92017 | 4.00733 | \"female\" | 1 | 4 | 2 |\n", "| 27765 | 3 | 2.46883 | 3.13549 | \"male\" | 0 | 3 | 2 |\n", "\n", "| Row | Injury | Actdays | Insurance | Commune |\n", "|-------|--------|---------|-----------|---------|\n", "| 1 | 0 | 0 | 0 | 192 |\n", "| 2 | 0 | 0 | 0 | 167 |\n", "| 3 | 0 | 0 | 1 | 76 |\n", "| 4 | 0 | 0 | 1 | 123 |\n", "| 5 | 0 | 0 | 0 | 148 |\n", "| 6 | 0 | 0 | 1 | 20 |\n", "| 7 | 0 | 0 | 1 | 40 |\n", "| 8 | 0 | 0 | 1 | 57 |\n", "| 9 | 0 | 0 | 0 | 49 |\n", "| 10 | 0 | 0 | 0 
| 170 |\n", "| 11 | 0 | 0 | 0 | 40 |\n", "⋮\n", "| 27754 | 0 | 0 | 0 | 86 |\n", "| 27755 | 0 | 0 | 0 | 90 |\n", "| 27756 | 0 | 0 | 0 | 90 |\n", "| 27757 | 0 | 0 | 0 | 90 |\n", "| 27758 | 0 | 0 | 0 | 90 |\n", "| 27759 | 0 | 0 | 0 | 91 |\n", "| 27760 | 0 | 0 | 1 | 108 |\n", "| 27761 | 0 | 0 | 0 | 115 |\n", "| 27762 | 0 | 0 | 0 | 115 |\n", "| 27763 | 0 | 0 | 0 | 116 |\n", "| 27764 | 0 | 0 | 1 | 116 |\n", "| 27765 | 0 | 0 | 0 | 119 |" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "delete!(vn,:Illdays)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Pkg.add(\"DecisionTree\")\n", "using DecisionTree" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The DecisionTree package works with Julia Arrays, so we make those conversions" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "signals = convert(Array,days_ill)\n", "features = convert(Array,vn);" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "5 methods for generic function build_forest:" ], "text/plain": [ "# 5 methods for generic function \"build_forest\":\n", "build_forest{T<:FloatingPoint,U<:Real}(labels::Array{T<:FloatingPoint,1},features::Array{U<:Real,2},nsubfeatures::Integer,ntrees::Integer) at /home/juser/.julia/v0.3/DecisionTree/src/DecisionTree.jl:415\n", "build_forest{T<:FloatingPoint,U<:Real}(labels::Array{T<:FloatingPoint,1},features::Array{U<:Real,2},nsubfeatures::Integer,ntrees::Integer,maxlabels) at /home/juser/.julia/v0.3/DecisionTree/src/DecisionTree.jl:415\n", "build_forest{T<:FloatingPoint,U<:Real}(labels::Array{T<:FloatingPoint,1},features::Array{U<:Real,2},nsubfeatures::Integer,ntrees::Integer,maxlabels,partialsampling) at /home/juser/.julia/v0.3/DecisionTree/src/DecisionTree.jl:415\n", 
"build_forest(labels::Array{T,1},features::Array{T,2},nsubfeatures::Integer,ntrees::Integer) at /home/juser/.julia/v0.3/DecisionTree/src/DecisionTree.jl:245\n", "build_forest(labels::Array{T,1},features::Array{T,2},nsubfeatures::Integer,ntrees::Integer,partialsampling) at /home/juser/.julia/v0.3/DecisionTree/src/DecisionTree.jl:245" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "methods(build_forest)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Forest" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### build_forest parameters are:\n", "- **signals** (`labels`): the array of signal values we want to predict\n", "- **features** (`features`): the corresponding feature array that indicates those signals\n", "- **features used** (`nsubfeatures`): the number of features considered for each split (branch) of the tree\n", "- **number of trees** (`ntrees`): the number of trees in the forest; a larger number takes longer to build but could be more accurate\n", "- **sampling rate** (`partialsampling`): a number lowered from 1.0 to favor minority signals" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Ensemble of Decision Trees\n", "Trees: 10\n", "Avg Leaves: 3674.7\n", "Avg Depth: 24.3" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = build_forest(signals,features,int(sqrt(length(features[1,:]))),10,.9)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply the algorithm to any 2D matrix of features" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "27765-element Array{Any,1}:\n", " 7\n", " 3\n", " 0\n", " 3\n", " 10\n", " 0\n", " 0\n", " 0\n", " 4\n", " 7\n", " 2\n", " 5\n", " 3\n", " ⋮\n", " 3\n", " 10\n", " 0\n", " 1\n", " 0\n", " 0\n", " 30\n", " 3\n", " 0\n", " 0\n", " 20\n", " 7" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions 
= apply_forest(model,features)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "32x32 Array{Int64,2}:\n", " 16566 0 5 10 1 1 0 … 0 0 0 0 0 0 0 0 1 0\n", " 1 290 68 68 16 5 0 0 0 0 0 0 0 0 0 0 0\n", " 1 4 1383 114 41 7 0 0 0 0 1 0 0 0 0 0 0\n", " 0 2 87 1856 45 7 2 0 0 0 0 0 0 1 0 0 0\n", " 1 1 32 95 1129 4 1 0 0 0 1 0 0 1 0 0 0\n", " 1 0 56 143 46 841 1 … 0 0 0 0 0 0 1 0 0 0\n", " 1 0 23 55 26 8 287 0 0 0 0 0 0 2 0 0 0\n", " 1 0 24 66 16 8 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 4 16 12 3 1 0 0 0 0 0 0 0 0 0 0\n", " 0 0 1 6 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 12 40 18 5 1 … 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0\n", " 1 0 2 4 6 1 0 0 0 0 0 0 0 1 0 0 0\n", " ⋮ ⋮ ⋱ ⋮ ⋮ \n", " 1 3 7 23 13 6 0 … 0 0 0 1 0 0 3 0 0 0\n", " 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0\n", " 0 0 0 0 1 2 0 … 0 0 0 56 0 0 0 0 0 0\n", " 0 0 1 1 0 1 0 0 0 0 0 2 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0\n", " 0 0 13 22 4 3 0 0 0 0 0 0 0 274 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0\n", " 0 1 7 18 6 4 1 … 0 0 0 0 0 0 0 0 153 0\n", " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Classes: {0,1,2,3,4,5,6,7,8,9 … 22,23,24,25,26,27,28,29,30,60}\n", "Matrix: \n", "Accuracy: 0.9280388978930308\n", "Kappa: 0.8846820611436211" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confusion_matrix(signals,predictions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://github.com/bensadeghi/DecisionTree.jl/tree/83765089feb5b2d30d72046ab78ca044841c827d \n", "http://bensadeghi.com/decision-trees-julia/ \n", "http://appliedpredictivemodeling.com/blog/2013/12/8/28rmc2lv96h8fw8700zm4nl50busep \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 0.3.9", "language": "julia", "name": "julia-0.3" }, "language_info": { "name": "julia", "version": "0.3.9" } }, "nbformat": 4, "nbformat_minor": 0 }