{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Time Series Classification Demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "using TSML"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Let's add workers for parallel processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "using Distributed\n",
    "# Only add workers when none exist yet, so re-running this cell does not keep adding processes\n",
    "nprocs()==1 && addprocs()\n",
    "nworkers()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Load TSML Modules and other Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "@everywhere using TSML\n",
    "@everywhere using TSMLextra\n",
    "@everywhere using DataFrames\n",
    "@everywhere using Random\n",
    "@everywhere using Statistics\n",
    "@everywhere using StatsBase: iqr\n",
    "@everywhere using RDatasets\n",
    "ENV[\"COLUMNS\"]=1000; # for dataframe column size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Initialize ML models from Julia, Caret, and Scikitlearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Every learner is created with @everywhere so that each process has its own instance\n",
    "\n",
    "# Caret ML\n",
    "@everywhere caret_svmlinear = CaretLearner(Dict(:learner=>\"svmLinear\"))\n",
    "@everywhere caret_treebag = CaretLearner(Dict(:learner=>\"treebag\"))\n",
    "@everywhere caret_rpart = CaretLearner(Dict(:learner=>\"rpart\"))\n",
    "@everywhere caret_rf = CaretLearner(Dict(:learner=>\"rf\"))\n",
    "\n",
    "# ScikitLearn ML\n",
    "@everywhere sk_ridge = SKLearner(Dict(:learner=>\"RidgeClassifier\"))\n",
    "@everywhere sk_sgd = SKLearner(Dict(:learner=>\"SGDClassifier\"))\n",
    "@everywhere sk_knn = SKLearner(Dict(:learner=>\"KNeighborsClassifier\"))\n",
    "@everywhere sk_gb = SKLearner(Dict(:learner=>\"GradientBoostingClassifier\",:impl_args=>Dict(:n_estimators=>10)))\n",
    "@everywhere sk_extratree = SKLearner(Dict(:learner=>\"ExtraTreesClassifier\",:impl_args=>Dict(:n_estimators=>10)))\n",
    "@everywhere sk_rf = SKLearner(Dict(:learner=>\"RandomForestClassifier\",:impl_args=>Dict(:n_estimators=>10)))\n",
    "\n",
    "# Julia ML\n",
    "@everywhere jrf = RandomForest(Dict(:impl_args=>Dict(:num_trees=>300)))\n",
    "@everywhere jpt = PrunedTree()\n",
    "@everywhere jada = Adaboost()\n",
    "\n",
    "# Julia Ensembles\n",
    "@everywhere jvote_ens=VoteEnsemble(Dict(:learners=>[jrf,jpt,sk_gb,sk_extratree,sk_rf]))\n",
    "@everywhere jstack_ens=StackEnsemble(Dict(:learners=>[jrf,jpt,sk_gb,sk_extratree,sk_rf]))\n",
    "@everywhere jbest_ens=BestLearner(Dict(:learners=>[jrf,sk_gb,sk_rf]))\n",
    "@everywhere jsuper_ens=VoteEnsemble(Dict(:learners=>[jvote_ens,jstack_ens,jbest_ens,sk_rf,sk_gb]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Functions for feature extraction and prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Train `learner` on the rows of `data` selected by `train_ind` and return the\n",
    "# accuracy score of its predictions on the rows selected by `test_ind`.\n",
    "# All columns of `data` except the last are used as features; the last column\n",
    "# is used as the label.\n",
    "@everywhere function predict(learner,data,train_ind,test_ind)\n",
    "    features = convert(Matrix,data[:, 1:(end-1)])\n",
    "    labels = convert(Array,data[:, end])\n",
    "    # Create pipeline\n",
    "    pipeline = Pipeline(\n",
    "        Dict(\n",
    "            :transformers => [\n",
    "                OneHotEncoder(), # Encodes nominal features into numeric\n",
    "                Imputer(), # Imputes NA values\n",
    "                StandardScaler(),\n",
    "                learner # Predicts labels on instances\n",
    "            ]\n",
    "        )\n",
    "    )\n",
    "    # Train\n",
    "    fit!(pipeline, features[train_ind, :], labels[train_ind]);\n",
    "    # Predict\n",
    "    predictions = transform!(pipeline, features[test_ind, :]);\n",
    "    # Assess predictions\n",
    "    result = score(:accuracy, labels[test_ind], predictions)\n",
    "    return result\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Data processing and feature extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Build the table returned by `getstats(datadir)`, convert its `:dtype` column to\n",
    "# an Array{String}, and return columns 3 through the second-to-last of that table.\n",
    "@everywhere function extract_features_from_timeseries(datadir)\n",
    "    println(\"*** Extracting features ***\")\n",
    "    mdata = getstats(datadir)\n",
    "    mdata[!,:dtype] = mdata[!,:dtype] |> Array{String}\n",
    "    return mdata[!,3:(end-1)]\n",
    "end\n",
    "\n",
    "# Directory holding the input time series, relative to the notebook's working directory\n",
    "datadir = \"data/\"\n",
    "tsdata = extract_features_from_timeseries(datadir)\n",
    "first(tsdata,5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Run in parallel all models in different trials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Evaluate every learner in `learners` on `data` over `trials` holdout splits.\n",
    "#\n",
    "# Arguments:\n",
    "#   learners - Dict mapping a model name to a learner instance\n",
    "#   data     - DataFrame passed unchanged to `predict`\n",
    "#   trials   - number of holdout splits to run (default 5, must be >= 1)\n",
    "#\n",
    "# Returns a DataFrame with one row per model holding the mean, standard deviation\n",
    "# and count (`n`) of its accuracy scores, sorted by descending mean.\n",
    "# Throws ArgumentError when `trials` is below 1 or `learners` is empty.\n",
    "function parallelmodel(learners::Dict,data::DataFrame;trials=5)\n",
    "    # Fail early with a clear message instead of an error from the vcat reduction\n",
    "    trials >= 1 || throw(ArgumentError(\"trials must be at least 1\"))\n",
    "    isempty(learners) && throw(ArgumentError(\"learners must not be empty\"))\n",
    "    models=collect(keys(learners))\n",
    "    ctable=@distributed (vcat) for i=1:trials\n",
    "        # Seed the RNG from the trial number, then split into training and test sets\n",
    "        Random.seed!(3i)\n",
    "        (train_ind, test_ind) = holdout(size(data, 1), 0.20)\n",
    "        acc=@distributed (vcat) for model in models\n",
    "            res=predict(learners[model],data,train_ind,test_ind)\n",
    "            # Keep two decimals in the progress log; the unrounded score is what gets stored\n",
    "            println(\"trial \",i,\", \",model,\" => \",round(res,digits=2))\n",
    "            [model res i]\n",
    "        end\n",
    "        acc\n",
    "    end\n",
    "    # ctable has one row per (model, trial) with columns model, accuracy, trial\n",
    "    df = ctable |> DataFrame\n",
    "    rename!(df,:x1=>:model,:x2=>:acc,:x3=>:trial)\n",
    "    gp=by(df,:model) do x\n",
    "        DataFrame(mean=mean(x.acc),std=std(x.acc),n=length(x.acc))\n",
    "    end\n",
    "    sort!(gp,:mean,rev=true)\n",
    "    return gp\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Key prefixes: j = Julia, crt_ = Caret, skl_ = scikit-learn.\n",
    "# Left out of this run: :crt_rpart=>caret_rpart, :crt_rf=>caret_rf\n",
    "learners=Dict(\n",
    "    :jvote_ens=>jvote_ens,:jstack_ens=>jstack_ens,:jbest_ens=>jbest_ens,\n",
    "    :jrf=>jrf,:jada=>jada,:jsuper_ens=>jsuper_ens,\n",
    "    :crt_svmlinear=>caret_svmlinear,:crt_treebag=>caret_treebag,\n",
    "    :skl_knn=>sk_knn,:skl_gb=>sk_gb,:skl_extratree=>sk_extratree,\n",
    "    :skl_rf=>sk_rf\n",
    ");\n",
    "\n",
    "df = parallelmodel(learners,tsdata;trials=3)"
   ]
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": null,
   "lastKernelId": null
  },
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Julia 1.2.0",
   "language": "julia",
   "name": "julia-1.2"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}