{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CombineML learners on the iris dataset\n",
    "\n",
    "Builds a CombineML pipeline (one-hot encoding, imputation, learner), trains it on a random hold-out split of the iris data and reports the accuracy on the held-out rows. The same procedure is then wrapped in `processModel` so that several learners can be scored the same way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "using CombineML.Util\n",
    "using CombineML.Transformers\n",
    "import RDatasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data and split it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([76, 1, 118, 36, 102, 132, 28, 108, 90, 147 … 17, 113, 88, 77, 85, 47, 61, 144, 54, 60], [39, 68, 111, 24, 20, 114, 8, 52, 142, 44 … 53, 104, 93, 122, 46, 25, 30, 80, 23, 32])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# X holds the four measurement columns, y the species labels\n",
    "iris = RDatasets.dataset(\"datasets\", \"iris\")\n",
    "X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])\n",
    "y = convert(Array, iris[:Species]);\n",
    "\n",
    "# Split into training and test sets\n",
    "(train_ind, test_ind) = holdout(size(X, 1), 0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Walkthrough: one pipeline with a pruned tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prunedTreeLearner = PrunedTree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CombineML.Transformers.CombineMLTransformers.Pipeline(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers, CombineML.Types.Transformer[CombineML.Transformers.CombineMLTransformers.OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), CombineML.Transformers.CombineMLTransformers.Imputer(nothing, Dict(:strategy=>mean)), CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))]),Pair{Symbol,Any}(:transformer_options, nothing)))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline = Pipeline(Dict(\n",
    "    :transformers => [\n",
    "        OneHotEncoder(),     # Encodes nominal features into numeric\n",
    "        Imputer(),           # Imputes NA values\n",
    "        #StandardScaler(),   # Standardizes features\n",
    "        prunedTreeLearner    # Predicts labels on instances\n",
    "    ]\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train\n",
    "fit!(pipeline, X[train_ind, :], y[train_ind]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Predict\n",
    "predictions = transform!(pipeline, X[test_ind, :]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Accuracy in percent, computed by hand\n",
    "sum(predictions .== y[test_ind])/length(predictions)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "97.77777777777777\n"
     ]
    }
   ],
   "source": [
    "# The same figure obtained through `score`\n",
    "result = score(:accuracy, y[test_ind], predictions)\n",
    "println(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reusable evaluation helper\n",
    "\n",
    "`processModel` repeats the walkthrough above for an arbitrary learner: load, split, fit, predict, score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "processModel (generic function with 1 method)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train `learner` on a fresh hold-out split of iris and return its test accuracy.\n",
    "#\n",
    "# Arguments\n",
    "#   learner          -- transformer used as the last pipeline stage (predicts the labels)\n",
    "#   test_proportion  -- keyword; forwarded as the second argument of `holdout`.\n",
    "#                       Defaults to 0.3, the value that used to be hard-coded.\n",
    "#\n",
    "# Returns the value of `score(:accuracy, ...)` on the held-out rows.\n",
    "#\n",
    "# The data is loaded and split inside the function, so it does not read the\n",
    "# notebook-level X, y, train_ind or test_ind, and every call draws a new split.\n",
    "function processModel(learner; test_proportion=0.3)\n",
    "    iris = RDatasets.dataset(\"datasets\", \"iris\")\n",
    "    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])\n",
    "    y = convert(Array, iris[:Species])\n",
    "    train_ind, test_ind = holdout(size(X, 1), test_proportion)\n",
    "    pipeline = Pipeline(Dict(\n",
    "        :transformers => [\n",
    "            OneHotEncoder(),     # Encodes nominal features into numeric\n",
    "            Imputer(),           # Imputes NA values\n",
    "            #StandardScaler(),   # Standardizes features\n",
    "            learner              # Predicts labels on instances\n",
    "        ]\n",
    "    ))\n",
    "    # Train\n",
    "    fit!(pipeline, X[train_ind, :], y[train_ind])\n",
    "    # Predict\n",
    "    predictions = transform!(pipeline, X[test_ind, :])\n",
    "    return score(:accuracy, y[test_ind], predictions)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Individual learners"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "64.44444444444444"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adaLearner = DecisionStumpAdaboost(Dict(\n",
    "    # Output to train against\n",
    "    # (:class).\n",
    "    :output => :class,\n",
    "    # Options specific to this implementation.\n",
    "    :impl_options => Dict(\n",
    "        # Number of boosting iterations.\n",
    "        :num_iterations => 7\n",
    "    )\n",
    "))\n",
    "processModel(adaLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "93.33333333333333"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rfLearner = RandomForest(Dict(\n",
    "    :output => :class,\n",
    "    :impl_options => Dict(\n",
    "        :num_subfeatures => nothing,\n",
    "        :num_trees => 10,\n",
    "        :partial_sampling => 0.7\n",
    "    )\n",
    "))\n",
    "processModel(rfLearner)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## scikit-learn learner\n",
    "\n",
    "**Note:** the stored output of the next cell is a `LoadError`: the ScikitLearn package was not installed in the environment that last ran this notebook. Run `Pkg.add(\"ScikitLearn\")` before re-executing it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "\u001b[91mArgumentError: Module ScikitLearn not found in current path.\nRun `Pkg.add(\"ScikitLearn\")` to install the ScikitLearn package.\u001b[39m",
     "output_type": "error",
     "traceback": [
      "\u001b[91mArgumentError: Module ScikitLearn not found in current path.\nRun `Pkg.add(\"ScikitLearn\")` to install the ScikitLearn package.\u001b[39m",
      "",
      "Stacktrace:",
      " [1] \u001b[1m_require\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:435\u001b[22m\u001b[22m",
      " [2] \u001b[1mrequire\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:405\u001b[22m\u001b[22m"
     ]
    }
   ],
   "source":
[
    "using ScikitLearn\n",
    "@sk_import neighbors: KNeighborsClassifier\n",
    "@sk_import svm: SVC\n",
    "\n",
    "skLearner = SKLLearner(Dict(\n",
    "    :output => :class,\n",
    "    # Alternative kept for reference: :learner => \"KNeighborsClassifier\"\n",
    "    :learner => \"SVC\",\n",
    "    :impl_options => Dict()\n",
    "))\n",
    "processModel(skLearner)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ensemble learners\n",
    "\n",
    "`VoteEnsemble`, `BestLearner` and `StackEnsemble` are each configured with a list of base learners and scored with `processModel`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voteLearner = VoteEnsemble(Dict(\n",
    "    :output => :class,\n",
    "    # Learners in voting committee.\n",
    "    :learners => [\n",
    "        RandomForest(),\n",
    "        PrunedTree(),\n",
    "        DecisionStumpAdaboost()\n",
    "    ]\n",
    "))\n",
    "processModel(voteLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bestLearner = BestLearner(Dict(\n",
    "    :output => :class,\n",
    "    # Partitions come from kfold with 5 folds over the number of rows.\n",
    "    :partition_generator => (features, labels) -> kfold(size(features, 1), 5),\n",
    "    # Index of the largest mean taken along dimension 2 of the score matrix\n",
    "    # (presumably rows = learners, columns = partitions -- confirm against CombineML).\n",
    "    :selection_function => scores -> findmax(mean(scores, 2))[2],\n",
    "    :score_type => Real,\n",
    "    :learners => [\n",
    "        PrunedTree(),\n",
    "        DecisionStumpAdaboost(),\n",
    "        RandomForest()\n",
    "    ],\n",
    "    :learner_options_grid => nothing\n",
    "))\n",
    "processModel(bestLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "95.55555555555556"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stackLearner = StackEnsemble(Dict(\n",
    "    :output => :class,\n",
    "    # Base learners; the vote and best-learner ensembles defined above are reused here.\n",
    "    :learners => [\n",
    "        PrunedTree(),\n",
    "        DecisionStumpAdaboost(),\n",
    "        RandomForest(),\n",
    "        voteLearner,\n",
    "        bestLearner\n",
    "    ],\n",
    "    :stacker => RandomForest(),\n",
    "    # Proportion of training set left to train stacker itself.\n",
    "    :stacker_training_proportion => 0.3,\n",
    "    :keep_original_features => false\n",
    "))\n",
    "processModel(stackLearner)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Repeated evaluation of the stacked ensemble\n",
    "\n",
    "A single hold-out score changes from split to split (the scores visible below include values from 84.4 to 100.0), so the stacked ensemble is scored 30 times and summarised by its mean and standard deviation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "acc = 94.0 +/- 4.0\n"
     ]
    }
   ],
   "source": [
    "# Collect one accuracy per run; vcat joins the per-iteration values into a vector\n",
    "results = @parallel (vcat) for trial = 1:30\n",
    "    processModel(stackLearner)\n",
    "end\n",
    "accuracy_mean = round(mean(results))\n",
    "accuracy_std = round(std(results))\n",
    "println(\"acc = \", accuracy_mean, \" +/- \", accuracy_std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30-element Array{Float64,1}:\n",
       " 100.0 \n",
       " 93.3333\n",
       " 95.5556\n",
       " 88.8889\n",
       " 91.1111\n",
       " 93.3333\n",
       " 93.3333\n",
       " 95.5556\n",
       " 97.7778\n",
       " 95.5556\n",
       " 88.8889\n",
       " 93.3333\n",
       " 91.1111\n",
       " ⋮ \n",
       " 97.7778\n",
       " 93.3333\n",
       " 95.5556\n",
       " 84.4444\n",
       " 95.5556\n",
       " 93.3333\n",
       " 93.3333\n",
       " 95.5556\n",
       " 97.7778\n",
       " 93.3333\n",
       " 95.5556\n",
       " 91.1111"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Disabled example: CRTLearner configuration, left commented out.\n",
    "# The :learner lines list the alternative values that were tried.\n",
    "#svmcrt = CRTLearner(Dict(\n",
    "    # Output to train against\n",
    "    # (:class).\n",
    "    #:output => :class,\n",
    "    #:learner => \"rf\",\n",
    "    #:learner => \"svmLinear2\",\n",
    "    #:learner => \"rpart\",\n",
    "    #:learner => \"lda\",\n",
    "    #:impl_options => Dict()\n",
    "#))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}