In [3]:
using CombineML.Util
using CombineML.Transformers
import RDatasets

In [4]:
# Load the iris data: four measurement columns as features, species as the label.
iris = RDatasets.dataset("datasets", "iris")
feature_columns = [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]
X = convert(Array, iris[feature_columns])
y = convert(Array, iris[:Species]);

# Split the row indices into training and test index sets
n_instances = size(X, 1)
train_ind, test_ind = holdout(n_instances, 0.3)

([76, 1, 118, 36, 102, 132, 28, 108, 90, 147 … 17, 113, 88, 77, 85, 47, 61, 144, 54, 60], [39, 68, 111, 24, 20, 114, 8, 52, 142, 44 … 53, 104, 93, 122, 46, 25, 30, 80, 23, 32])

In [5]:
# Decision-tree learner built with its default options (the cell output lists them).
prunedTreeLearner = PrunedTree()

CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))

In [6]:
# Pipeline stages, listed in the order they are applied.
pipeline_stages = [
    OneHotEncoder(),   # Encodes nominal features into numeric
    Imputer(),         # Imputes NA values
    #StandardScaler(), # Standardizes features (currently disabled)
    prunedTreeLearner  # Predicts labels on instances
]
pipeline = Pipeline(Dict(:transformers => pipeline_stages))

CombineML.Transformers.CombineMLTransformers.Pipeline(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers, CombineML.Types.Transformer[CombineML.Transformers.CombineMLTransformers.OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), CombineML.Transformers.CombineMLTransformers.Imputer(nothing, Dict(:strategy=>mean)), CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))]),Pair{Symbol,Any}(:transformer_options, nothing)))

In [7]:
# Train the pipeline on the training rows only.
X_train = X[train_ind, :]
y_train = y[train_ind]
fit!(pipeline, X_train, y_train);

In [8]:
# Run the fitted pipeline over the held-out rows to get predicted labels.
X_test = X[test_ind, :]
predictions = transform!(pipeline, X_test);

In [9]:
# Percentage of test instances whose predicted label equals the true label.
n_correct = sum(predictions .== y[test_ind])
n_correct / length(predictions) * 100

97.77777777777777

In [10]:
# Same accuracy computed with the library's `score` helper; the printed value matches
# the manual percentage computed in the previous cell.
result = score(:accuracy, y[test_ind], predictions)
println(result)

97.77777777777777


In [11]:
# Train `learner` on a holdout split of the iris data and return its test accuracy.
#
# Arguments:
#   learner         - final stage of the pipeline (placed after one-hot encoding and
#                     imputation); fitted on the training rows, then used to predict
#                     the test rows.
#   test_proportion - keyword, default 0.3; passed as the second argument to `holdout`,
#                     whose second returned index set is used as the test set.
#
# Returns the value of `score(:accuracy, ...)` for the test labels vs. the predictions.
#
# The data is reloaded and re-split on every call and nothing is seeded here, so
# repeated calls with the same learner can return different accuracies.
function processModel(learner; test_proportion=0.3)
    iris = RDatasets.dataset("datasets", "iris")
    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
    y = convert(Array, iris[:Species])
    (train_ind, test_ind) = holdout(size(X, 1), test_proportion)
    pipeline = Pipeline(Dict(
        :transformers => [
            OneHotEncoder(),   # Encodes nominal features into numeric
            Imputer(),         # Imputes NA values
            #StandardScaler(), # Standardizes features (currently disabled)
            learner            # Predicts labels on instances
        ]
    ))
    # Train on the training rows only
    fit!(pipeline, X[train_ind, :], y[train_ind])
    # Predict the held-out rows and score them against the true labels
    predictions = transform!(pipeline, X[test_ind, :])
    return score(:accuracy, y[test_ind], predictions)
end

processModel (generic function with 1 method)

In [12]:
# Options specific to the boosting implementation.
ada_impl_options = Dict(
    :num_iterations => 7  # Number of boosting iterations.
)
# AdaBoost over decision stumps, trained against the class output.
adaLearner = DecisionStumpAdaboost(Dict(
    :output => :class,
    :impl_options => ada_impl_options
))
processModel(adaLearner)

64.44444444444444

In [13]:
# Options specific to the random-forest implementation.
rf_impl_options = Dict(
    :num_subfeatures => nothing,
    :num_trees => 10,
    :partial_sampling => 0.7
)
# Random forest trained against the class output.
rfLearner = RandomForest(Dict(
    :output => :class,
    :impl_options => rf_impl_options
))
processModel(rfLearner)

93.33333333333333

In [14]:
# NOTE: this cell needs the ScikitLearn package. The recorded run below failed with
# "Module ScikitLearn not found"; run `Pkg.add("ScikitLearn")` before executing it.
using ScikitLearn
# Both classifiers are imported, but only SVC is selected in the options below.
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC

# Learner configured with the "SVC" classifier name; swap in the commented-out
# :learner line to try KNeighborsClassifier instead.
skLearner = SKLLearner(Dict(
 :output => :class,
 #:learner => "KNeighborsClassifier",
 :learner => "SVC",
 :impl_options => Dict()
))
processModel(skLearner)

LoadError: [91mArgumentError: Module ScikitLearn not found in current path.
Run `Pkg.add("ScikitLearn")` to install the ScikitLearn package.[39m

In [15]:
# Learners in the voting committee, each built with its default options.
vote_committee = [RandomForest(), PrunedTree(), DecisionStumpAdaboost()]
voteLearner = VoteEnsemble(Dict(
    :output => :class,
    :learners => vote_committee
))
processModel(voteLearner)

97.77777777777777

In [16]:
# Generates partitions by calling `kfold` with 5 folds over the instance count.
make_partitions = (X, y) -> kfold(size(X, 1), 5)
# Returns the index of the largest mean score, averaging along dimension 2
# (presumably one row per learner and one column per partition -- verify).
pick_best = (learner_partition_scores) -> findmax(mean(learner_partition_scores, 2))[2]
# Candidate learners handed to BestLearner, each built with its default options.
best_candidates = [PrunedTree(), DecisionStumpAdaboost(), RandomForest()]
bestLearner = BestLearner(Dict(
    :output => :class,
    :partition_generator => make_partitions,
    :selection_function => pick_best,
    :score_type => Real,
    :learners => best_candidates,
    :learner_options_grid => nothing
))
processModel(bestLearner)

97.77777777777777

In [20]:
# Base learners for the stack: three default learners plus the vote and best-learner
# ensembles defined in earlier cells.
stack_members = [
    PrunedTree(), DecisionStumpAdaboost(), RandomForest(), voteLearner, bestLearner
]
stackLearner = StackEnsemble(Dict(
    :output => :class,
    :learners => stack_members,
    :stacker => RandomForest(),
    # Proportion of training set left to train stacker itself.
    :stacker_training_proportion => 0.3,
    :keep_original_features => false
))
processModel(stackLearner)

95.55555555555556

In [21]:
# Evaluate the stacked ensemble 30 times and concatenate the accuracies into a vector.
results = @parallel (vcat) for trial in 1:30
    processModel(stackLearner)
end
# Report the mean and standard deviation, each rounded to the nearest whole number.
mean_accuracy = round(mean(results))
accuracy_spread = round(std(results))
println("acc = ", mean_accuracy, " +/- ", accuracy_spread)

acc = 94.0 +/- 4.0


In [22]:
# Display the individual accuracies collected from the 30 runs above.
results

30-element Array{Float64,1}:
 100.0 
 93.3333
 95.5556
 88.8889
 91.1111
 93.3333
 93.3333
 95.5556
 97.7778
 95.5556
 88.8889
 93.3333
 91.1111
 ⋮ 
 97.7778
 93.3333
 95.5556
 84.4444
 95.5556
 93.3333
 93.3333
 95.5556
 97.7778
 93.3333
 95.5556
 91.1111

In [17]:
#svmcrt = CRTLearner(Dict(
 # Output to train against
 # (:class).
 #:output => :class,
 #:learner => "rf",
 #:learner => "svmLinear2",
 #:learner => "rpart",
 #:learner => "lda",
 #:impl_options => Dict()
#))