{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "using CombineML.Util\n",
    "using CombineML.Transformers\n",
    "import RDatasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([76, 1, 118, 36, 102, 132, 28, 108, 90, 147  …  17, 113, 88, 77, 85, 47, 61, 144, 54, 60], [39, 68, 111, 24, 20, 114, 8, 52, 142, 44  …  53, 104, 93, 122, 46, 25, 30, 80, 23, 32])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris = RDatasets.dataset(\"datasets\", \"iris\")\n",
    "X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])\n",
    "y = convert(Array, iris[:Species]);\n",
    "\n",
    "# Split into training and test sets\n",
    "(train_ind, test_ind) = holdout(size(X, 1), 0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prunedTreeLearner = PrunedTree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CombineML.Transformers.CombineMLTransformers.Pipeline(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers, CombineML.Types.Transformer[CombineML.Transformers.CombineMLTransformers.OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), CombineML.Transformers.CombineMLTransformers.Imputer(nothing, Dict(:strategy=>mean)), CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))]),Pair{Symbol,Any}(:transformer_options, nothing)))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline = Pipeline(Dict(\n",
    "        :transformers => [\n",
    "            OneHotEncoder(), # Encodes nominal features into numeric\n",
    "            Imputer(), # Imputes NA values\n",
    "            #StandardScaler(), # Standardizes features \n",
    "            prunedTreeLearner # Predicts labels on instances\n",
    "        ]\n",
    "    ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train\n",
    "fit!(pipeline, X[train_ind, :], y[train_ind]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Predict\n",
    "predictions = transform!(pipeline, X[test_ind, :]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(predictions .== y[test_ind])/length(predictions)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "97.77777777777777\n"
     ]
    }
   ],
   "source": [
    "result = score(:accuracy, y[test_ind], predictions)\n",
    "println(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "processModel (generic function with 1 method)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "function processModel(learner)\n",
    "    iris = RDatasets.dataset(\"datasets\", \"iris\")\n",
    "    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])\n",
    "    y = convert(Array, iris[:Species]);\n",
    "    (train_ind, test_ind) = holdout(size(X, 1), 0.3)\n",
    "    pipeline = Pipeline(Dict(\n",
    "            :transformers => [\n",
    "                OneHotEncoder(), # Encodes nominal features into numeric\n",
    "                Imputer(), # Imputes NA values\n",
    "                #StandardScaler(), # Standardizes features \n",
    "                learner # Predicts labels on instances\n",
    "            ]\n",
    "        ))\n",
    "    # Train\n",
    "    fit!(pipeline, X[train_ind, :], y[train_ind]);\n",
    "    # Predict\n",
    "    predictions = transform!(pipeline, X[test_ind, :]);\n",
    "    result = score(:accuracy, y[test_ind], predictions)\n",
    "    return(result)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "64.44444444444444"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adaLearner = DecisionStumpAdaboost(Dict(\n",
    "  # Output to train against\n",
    "  # (:class).\n",
    "  :output => :class,\n",
    "  # Options specific to this implementation.\n",
    "  :impl_options => Dict(\n",
    "    # Number of boosting iterations.\n",
    "    :num_iterations => 7\n",
    "  )\n",
    "))\n",
    "processModel(adaLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "93.33333333333333"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rfLearner = RandomForest(Dict(\n",
    "  :output => :class,\n",
    "  :impl_options => Dict(\n",
    "    :num_subfeatures => nothing,\n",
    "    :num_trees => 10,\n",
    "    :partial_sampling => 0.7\n",
    "  )\n",
    "))\n",
    "processModel(rfLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "\u001b[91mArgumentError: Module ScikitLearn not found in current path.\nRun `Pkg.add(\"ScikitLearn\")` to install the ScikitLearn package.\u001b[39m",
     "output_type": "error",
     "traceback": [
      "\u001b[91mArgumentError: Module ScikitLearn not found in current path.\nRun `Pkg.add(\"ScikitLearn\")` to install the ScikitLearn package.\u001b[39m",
      "",
      "Stacktrace:",
      " [1] \u001b[1m_require\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:435\u001b[22m\u001b[22m",
      " [2] \u001b[1mrequire\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m./loading.jl:405\u001b[22m\u001b[22m"
     ]
    }
   ],
   "source": [
    "using ScikitLearn\n",
    "@sk_import neighbors: KNeighborsClassifier\n",
    "@sk_import svm: SVC\n",
    "\n",
    "skLearner = SKLLearner(Dict(\n",
    "  :output => :class,\n",
    "  #:learner => \"KNeighborsClassifier\",\n",
    "  :learner => \"SVC\",\n",
    "  :impl_options => Dict()\n",
    "))\n",
    "processModel(skLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voteLearner = VoteEnsemble(Dict(\n",
    "  :output => :class,\n",
    "  # Learners in voting committee.\n",
    "  :learners => [RandomForest(),PrunedTree(), DecisionStumpAdaboost()]\n",
    "))\n",
    "processModel(voteLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97.77777777777777"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bestLearner = BestLearner(Dict(\n",
    "  :output => :class,\n",
    "  :partition_generator => (X, y) -> kfold(size(X, 1), 5),\n",
    "  :selection_function => (learner_partition_scores) -> findmax(mean(learner_partition_scores, 2))[2],      \n",
    "  :score_type => Real,\n",
    "  :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest()],\n",
    "  :learner_options_grid => nothing\n",
    "))\n",
    "processModel(bestLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "95.55555555555556"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stackLearner = StackEnsemble(Dict(\n",
    "  :output => :class,\n",
    "  :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest(),voteLearner,bestLearner],\n",
    "  :stacker => RandomForest(),\n",
    "  # Proportion of training set left to train stacker itself.\n",
    "  :stacker_training_proportion => 0.3,\n",
    "  :keep_original_features => false\n",
    "))\n",
    "processModel(stackLearner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "acc = 94.0 +/- 4.0\n"
     ]
    }
   ],
   "source": [
    "results=@parallel (vcat) for i=1:30\n",
    "   processModel(stackLearner)\n",
    "end\n",
    "println(\"acc = \",round(mean(results)),\" +/- \",round(std(results)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30-element Array{Float64,1}:\n",
       " 100.0   \n",
       "  93.3333\n",
       "  95.5556\n",
       "  88.8889\n",
       "  91.1111\n",
       "  93.3333\n",
       "  93.3333\n",
       "  95.5556\n",
       "  97.7778\n",
       "  95.5556\n",
       "  88.8889\n",
       "  93.3333\n",
       "  91.1111\n",
       "   ⋮     \n",
       "  97.7778\n",
       "  93.3333\n",
       "  95.5556\n",
       "  84.4444\n",
       "  95.5556\n",
       "  93.3333\n",
       "  93.3333\n",
       "  95.5556\n",
       "  97.7778\n",
       "  93.3333\n",
       "  95.5556\n",
       "  91.1111"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#svmcrt = CRTLearner(Dict(\n",
    "  # Output to train against\n",
    "  # (:class).\n",
    "  #:output => :class,\n",
    "  #:learner => \"rf\",\n",
    "  #:learner => \"svmLinear2\",\n",
    "  #:learner => \"rpart\",\n",
    "  #:learner => \"lda\",\n",
    "  #:impl_options => Dict()\n",
    "#))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}