{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Load TSML filters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "using TSML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create artificial data function" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "using DataFrames\n", "using Dates\n", "using Random\n", "\n", "ENV[\"COLUMNS\"]=1000 # for dataframe column size\n", "\n", "function generateXY()\n", " Random.seed!(123)\n", " gdate = DateTime(2014,1,1):Dates.Minute(15):DateTime(2014,1,5)\n", " gval = Array{Union{Missing,Float64}}(rand(length(gdate)))\n", " gmissing = floor(0.30*length(gdate)) |> Integer\n", " gndxmissing = Random.shuffle(1:length(gdate))[1:gmissing]\n", " X = DataFrame(Date=gdate,Value=gval)\n", " X.Value[gndxmissing] .= missing\n", " Y = rand(length(gdate))\n", " (X,Y)\n", "end;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Generate artificial data with missing" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

10 rows × 2 columns

DateValue
DateTimeFloat64⍰
12014-01-01T00:00:000.768448
22014-01-01T00:15:000.940515
32014-01-01T00:30:000.673959
42014-01-01T00:45:000.395453
52014-01-01T01:00:00missing
62014-01-01T01:15:000.662555
72014-01-01T01:30:000.586022
82014-01-01T01:45:00missing
92014-01-01T02:00:000.26864
102014-01-01T02:15:00missing
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& Date & Value\\\\\n", "\t\\hline\n", "\t& DateTime & Float64⍰\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 0.768448 \\\\\n", "\t2 & 2014-01-01T00:15:00 & 0.940515 \\\\\n", "\t3 & 2014-01-01T00:30:00 & 0.673959 \\\\\n", "\t4 & 2014-01-01T00:45:00 & 0.395453 \\\\\n", "\t5 & 2014-01-01T01:00:00 & \\\\\n", "\t6 & 2014-01-01T01:15:00 & 0.662555 \\\\\n", "\t7 & 2014-01-01T01:30:00 & 0.586022 \\\\\n", "\t8 & 2014-01-01T01:45:00 & \\\\\n", "\t9 & 2014-01-01T02:00:00 & 0.26864 \\\\\n", "\t10 & 2014-01-01T02:15:00 & \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "10×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n", "├─────┼─────────────────────┼──────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 0.768448 │\n", "│ 2 │ 2014-01-01T00:15:00 │ 0.940515 │\n", "│ 3 │ 2014-01-01T00:30:00 │ 0.673959 │\n", "│ 4 │ 2014-01-01T00:45:00 │ 0.395453 │\n", "│ 5 │ 2014-01-01T01:00:00 │ \u001b[90mmissing\u001b[39m │\n", "│ 6 │ 2014-01-01T01:15:00 │ 0.662555 │\n", "│ 7 │ 2014-01-01T01:30:00 │ 0.586022 │\n", "│ 8 │ 2014-01-01T01:45:00 │ \u001b[90mmissing\u001b[39m │\n", "│ 9 │ 2014-01-01T02:00:00 │ 0.26864 │\n", "│ 10 │ 2014-01-01T02:15:00 │ \u001b[90mmissing\u001b[39m │" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df,outY)=generateXY()\n", "first(df,10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## User Pipeline and Plotter to plot artificial data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-01-02\n", "\n", "\n", "2014-01-03\n", "\n", "\n", "2014-01-04\n", 
"\n", "\n", "2014-01-05\n", "\n", "\n", "0.00\n", "\n", "\n", "0.25\n", "\n", "\n", "0.50\n", "\n", "\n", "0.75\n", "\n", "\n", "1.00\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pltr=Plotter(Dict(:interactive => false))\n", "\n", "mypipeline = Pipeline(Dict(\n", " :transformers => [pltr]\n", " )\n", ")\n", "\n", "fit!(mypipeline, df)\n", "transform!(mypipeline, df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get statistics including blocks of missing data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

1 rows × 26 columns

tstarttendsfreqcountmaxminmedianmeanq1q2q25q75q8q9kurtosisskewnessvariationentropyautocorpacfbmedianbmeanbq25bq75bminbmax
DateTimeDateTimeFloat64Int64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12014-01-01T00:00:002014-01-05T00:00:000.2493512700.9954140.0004123990.5211840.5058730.1215820.2131520.2796230.7457840.7814250.870951-1.14079-0.0653120.54621169.52030.3206050.3127061.01.321841.02.01.03.0
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccccccccccccccccccc}\n", "\t& tstart & tend & sfreq & count & max & min & median & mean & q1 & q2 & q25 & q75 & q8 & q9 & kurtosis & skewness & variation & entropy & autocor & pacf & bmedian & bmean & bq25 & bq75 & bmin & bmax\\\\\n", "\t\\hline\n", "\t& DateTime & DateTime & Float64 & Int64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 2014-01-05T00:00:00 & 0.249351 & 270 & 0.995414 & 0.000412399 & 0.521184 & 0.505873 & 0.121582 & 0.213152 & 0.279623 & 0.745784 & 0.781425 & 0.870951 & -1.14079 & -0.065312 & 0.546211 & 69.5203 & 0.320605 & 0.312706 & 1.0 & 1.32184 & 1.0 & 2.0 & 1.0 & 3.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "1×26 DataFrame\n", "│ Row │ tstart │ tend │ sfreq │ count │ max │ min │ median │ mean │ q1 │ q2 │ q25 │ q75 │ q8 │ q9 │ kurtosis │ skewness │ variation │ entropy │ autocor │ pacf │ bmedian │ bmean │ bq25 │ bq75 │ bmin │ bmax │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", 
"├─────┼─────────────────────┼─────────────────────┼──────────┼───────┼──────────┼─────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼─────────┼──────────┼──────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 2014-01-05T00:00:00 │ 0.249351 │ 270 │ 0.995414 │ 0.000412399 │ 0.521184 │ 0.505873 │ 0.121582 │ 0.213152 │ 0.279623 │ 0.745784 │ 0.781425 │ 0.870951 │ -1.14079 │ -0.065312 │ 0.546211 │ 69.5203 │ 0.320605 │ 0.312706 │ 1.0 │ 1.32184 │ 1.0 │ 2.0 │ 1.0 │ 3.0 │" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "statfier = Statifier(Dict(:processmissing=>true))\n", "\n", "mypipeline = Pipeline(Dict(\n", " :transformers => [statfier]\n", " )\n", ")\n", "\n", "fit!(mypipeline, df)\n", "res = transform!(mypipeline, df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use Pipeline: aggregate, impute, and plot " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-01-02\n", "\n", "\n", "2014-01-03\n", "\n", "\n", "2014-01-04\n", "\n", "\n", "2014-01-05\n", "\n", "\n", "0.00\n", "\n", "\n", "0.25\n", "\n", "\n", "0.50\n", "\n", "\n", "0.75\n", "\n", "\n", "1.00\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valgator = DateValgator(Dict(:dateinterval=>Dates.Hour(1)))\n", "valnner = DateValNNer(Dict(:dateinterval=>Dates.Hour(1)))\n", "\n", "mypipeline = Pipeline(Dict(\n", " :transformers => [valgator,pltr]\n", " )\n", ")\n", 
"\n", "fit!(mypipeline, df)\n", "transform!(mypipeline, df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Try real data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "fname = joinpath(dirname(pathof(TSML)),\"../data/testdata.csv\")\n", "csvreader = CSVDateValReader(Dict(:filename=>fname,:dateformat=>\"dd/mm/yyyy HH:MM\"))\n", "\n", "outputname = joinpath(dirname(pathof(TSML)),\"/tmp/testdata_output.csv\")\n", "csvwriter = CSVDateValWriter(Dict(:filename=>outputname))\n", "\n", "valgator = DateValgator(Dict(:dateinterval=>Dates.Hour(1)))\n", "valnner = DateValNNer(Dict(:dateinterval=>Dates.Hour(1)))\n", "stfier = Statifier(Dict(:processmissing=>true))\n", "outliernicer = Outliernicer(Dict(:dateinterval=>Dates.Hour(1)));" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot real data with missing values" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-04-01\n", "\n", "\n", "2014-07-01\n", "\n", "\n", "2014-10-01\n", "\n", "\n", "2015-01-01\n", "\n", "\n", "10\n", "\n", "\n", "12\n", "\n", "\n", "14\n", "\n", "\n", "16\n", "\n", "\n", "18\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpipeline1 = Pipeline(Dict(\n", " :transformers => [csvreader,valgator,pltr]\n", " )\n", ")\n", "\n", "fit!(mpipeline1)\n", "transform!(mpipeline1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get statistics including blocks of missing data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

1 rows × 26 columns

tstarttendsfreqcountmaxminmedianmeanq1q2q25q75q8q9kurtosisskewnessvariationentropyautocorpacfbmedianbmeanbq25bq75bminbmax
DateTimeDateTimeFloat64Int64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12014-01-01T00:00:002015-01-01T00:00:000.999886383018.88.510.3511.5579.910.010.012.313.016.00.7306351.412830.200055-1.09145e54.393151.046445.010.55893.06.01.02380.0
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccccccccccccccccccc}\n", "\t& tstart & tend & sfreq & count & max & min & median & mean & q1 & q2 & q25 & q75 & q8 & q9 & kurtosis & skewness & variation & entropy & autocor & pacf & bmedian & bmean & bq25 & bq75 & bmin & bmax\\\\\n", "\t\\hline\n", "\t& DateTime & DateTime & Float64 & Int64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 2015-01-01T00:00:00 & 0.999886 & 3830 & 18.8 & 8.5 & 10.35 & 11.557 & 9.9 & 10.0 & 10.0 & 12.3 & 13.0 & 16.0 & 0.730635 & 1.41283 & 0.200055 & -1.09145e5 & 4.39315 & 1.04644 & 5.0 & 10.5589 & 3.0 & 6.0 & 1.0 & 2380.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "1×26 DataFrame\n", "│ Row │ tstart │ tend │ sfreq │ count │ max │ min │ median │ mean │ q1 │ q2 │ q25 │ q75 │ q8 │ q9 │ kurtosis │ skewness │ variation │ entropy │ autocor │ pacf │ bmedian │ bmean │ bq25 │ bq75 │ bmin │ bmax │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", 
"├─────┼─────────────────────┼─────────────────────┼──────────┼───────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼──────────┼──────────┼───────────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 2015-01-01T00:00:00 │ 0.999886 │ 3830 │ 18.8 │ 8.5 │ 10.35 │ 11.557 │ 9.9 │ 10.0 │ 10.0 │ 12.3 │ 13.0 │ 16.0 │ 0.730635 │ 1.41283 │ 0.200055 │ -1.09145e5 │ 4.39315 │ 1.04644 │ 5.0 │ 10.5589 │ 3.0 │ 6.0 │ 1.0 │ 2380.0 │" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpipeline1 = Pipeline(Dict(\n", " :transformers => [csvreader,valgator,stfier]\n", " )\n", ")\n", "\n", "fit!(mpipeline1)\n", "respipe1 = transform!(mpipeline1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Try imputing and get statistics" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

1 rows × 26 columns

tstarttendsfreqcountmaxminmedianmeanq1q2q25q75q8q9kurtosisskewnessvariationentropyautocorpacfbmedianbmeanbq25bq75bminbmax
DateTimeDateTimeFloat64Int64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12014-01-01T00:00:002015-01-01T00:00:000.999886876118.88.510.011.13629.9510.010.011.512.014.952.372741.874520.187997-2.36714e54.478861.06917NaNNaNNaNNaNNaNNaN
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccccccccccccccccccc}\n", "\t& tstart & tend & sfreq & count & max & min & median & mean & q1 & q2 & q25 & q75 & q8 & q9 & kurtosis & skewness & variation & entropy & autocor & pacf & bmedian & bmean & bq25 & bq75 & bmin & bmax\\\\\n", "\t\\hline\n", "\t& DateTime & DateTime & Float64 & Int64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 2015-01-01T00:00:00 & 0.999886 & 8761 & 18.8 & 8.5 & 10.0 & 11.1362 & 9.95 & 10.0 & 10.0 & 11.5 & 12.0 & 14.95 & 2.37274 & 1.87452 & 0.187997 & -2.36714e5 & 4.47886 & 1.06917 & NaN & NaN & NaN & NaN & NaN & NaN \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "1×26 DataFrame\n", "│ Row │ tstart │ tend │ sfreq │ count │ max │ min │ median │ mean │ q1 │ q2 │ q25 │ q75 │ q8 │ q9 │ kurtosis │ skewness │ variation │ entropy │ autocor │ pacf │ bmedian │ bmean │ bq25 │ bq75 │ bmin │ bmax │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n",
"├─────┼─────────────────────┼─────────────────────┼──────────┼───────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼──────────┼──────────┼───────────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 2015-01-01T00:00:00 │ 0.999886 │ 8761 │ 18.8 │ 8.5 │ 10.0 │ 11.1362 │ 9.95 │ 10.0 │ 10.0 │ 11.5 │ 12.0 │ 14.95 │ 2.37274 │ 1.87452 │ 0.187997 │ -2.36714e5 │ 4.47886 │ 1.06917 │ NaN │ NaN │ NaN │ NaN │ NaN │ NaN │" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpipeline2 = Pipeline(Dict(\n", " :transformers => [csvreader,valgator,valnner,stfier]\n", " )\n", ")\n", "\n", "fit!(mpipeline2)\n", "respipe2 = transform!(mpipeline2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot imputed data" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-04-01\n", "\n", "\n", "2014-07-01\n", "\n", "\n", "2014-10-01\n", "\n", "\n", "2015-01-01\n", "\n", "\n", "10\n", "\n", "\n", "12\n", "\n", "\n", "14\n", "\n", "\n", "16\n", "\n", "\n", "18\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mpipeline2 = Pipeline(Dict(\n", " :transformers => [csvreader,valgator,valnner,pltr]\n", " )\n", ")\n", "\n", "fit!(mpipeline2)\n", "transform!(mpipeline2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Monotonicer" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "regularfile =
joinpath(dirname(pathof(TSML)),\"../data/typedetection/regular.csv\")\n", "monofile = joinpath(dirname(pathof(TSML)),\"../data/typedetection/monotonic.csv\")\n", "dailymonofile = joinpath(dirname(pathof(TSML)),\"../data/typedetection/dailymonotonic.csv\")\n", "\n", "# One CSV reader per sample file; all three use the same timestamp format\n", "regularfilecsv = CSVDateValReader(Dict(:filename=>regularfile,:dateformat=>\"dd/mm/yyyy HH:MM\"))\n", "monofilecsv = CSVDateValReader(Dict(:filename=>monofile,:dateformat=>\"dd/mm/yyyy HH:MM\"))\n", "dailymonofilecsv = CSVDateValReader(Dict(:filename=>dailymonofile,:dateformat=>\"dd/mm/yyyy HH:MM\"))\n", "\n", "valgator = DateValgator(Dict(:dateinterval=>Dates.Hour(1)))\n", "valnner = DateValNNer(Dict(:dateinterval=>Dates.Hour(1)))\n", "stfier = Statifier(Dict(:processmissing=>true))\n", "mono = Monotonicer(Dict())\n", "outliernicer = Outliernicer(Dict(:dateinterval=>Dates.Hour(1)));" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot of monotonic data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2016-01-06\n", "\n", "\n", "2016-01-13\n", "\n", "\n", "2016-01-20\n", "\n", "\n", "2016-01-27\n", "\n", "\n", "5.775×10\n", "\n", "\n", "7\n", "\n", "\n", "5.778×10\n", "\n", "\n", "7\n", "\n", "\n", "5.781×10\n", "\n", "\n", "7\n", "\n", "\n", "5.784×10\n", "\n", "\n", "7\n", "\n", "\n", "5.787×10\n", "\n", "\n", "7\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "monopipeline = Pipeline(Dict(\n", " :transformers => [monofilecsv,valgator,valnner,pltr]\n", " )\n", ")\n", "\n", "fit!(monopipeline)\n",
"transform!(monopipeline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot after normalization of monotonic data" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2016-01-06\n", "\n", "\n", "2016-01-13\n", "\n", "\n", "2016-01-20\n", "\n", "\n", "2016-01-27\n", "\n", "\n", "0\n", "\n", "\n", "2500\n", "\n", "\n", "5000\n", "\n", "\n", "7500\n", "\n", "\n", "10000\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "monopipeline = Pipeline(Dict(\n", " :transformers => [monofilecsv,valgator,valnner,mono,pltr]\n", " )\n", ")\n", "\n", "fit!(monopipeline)\n", "transform!(monopipeline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot with Monotonicer and Outliernicer" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2016-01-06\n", "\n", "\n", "2016-01-13\n", "\n", "\n", "2016-01-20\n", "\n", "\n", "2016-01-27\n", "\n", "\n", "220\n", "\n", "\n", "240\n", "\n", "\n", "260\n", "\n", "\n", "280\n", "\n", "\n", "300\n", "\n", "\n", "320\n", "\n", "\n", "340\n", "\n", "\n", "360\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "monopipeline = Pipeline(Dict(\n", " 
:transformers => [monofilecsv,valgator,valnner,mono,outliernicer,pltr]\n", " )\n", ")\n", "\n", "fit!(monopipeline)\n", "transform!(monopipeline)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot of daily monotonic" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2019-03-01\n", "\n", "\n", "2019-04-01\n", "\n", "\n", "0\n", "\n", "\n", "1000\n", "\n", "\n", "2000\n", "\n", "\n", "3000\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dailymonopipeline = Pipeline(Dict(\n", " :transformers => [dailymonofilecsv,valgator,valnner,pltr]\n", " )\n", ")\n", "\n", "fit!(dailymonopipeline)\n", "transform!(dailymonopipeline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot of daily monotonic data with Monotonicer" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2019-03-01\n", "\n", "\n", "2019-04-01\n", "\n", "\n", "0\n", "\n", "\n", "200\n", "\n", "\n", "400\n", "\n", "\n", "600\n", "\n", "\n", "800\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dailymonopipeline = Pipeline(Dict(\n", " :transformers => [dailymonofilecsv,valgator,valnner,mono,pltr]\n", " )\n", ")\n", "fit!(dailymonopipeline)\n", "transform!(dailymonopipeline)" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "## Plot of daily monotonic with Monotonicer and Outliernicer" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2019-03-01\n", "\n", "\n", "2019-04-01\n", "\n", "\n", "0\n", "\n", "\n", "20\n", "\n", "\n", "40\n", "\n", "\n", "60\n", "\n", "\n", "80\n", "\n", "\n", "100\n", "\n", "\n", "120\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dailymonopipeline = Pipeline(Dict(\n", " :transformers => [dailymonofilecsv,valgator,valnner,mono,outliernicer,pltr]\n", " )\n", ")\n", "fit!(dailymonopipeline)\n", "transform!(dailymonopipeline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot regular TS after monotonic normalization" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-04-01\n", "\n", "\n", "2014-07-01\n", "\n", "\n", "2014-10-01\n", "\n", "\n", "2015-01-01\n", "\n", "\n", "2\n", "\n", "\n", "4\n", "\n", "\n", "6\n", "\n", "\n", "8\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regpipeline = Pipeline(Dict(\n", " :transformers => [regularfilecsv,valgator,valnner,mono,pltr]\n", " )\n", ")\n", "\n", "fit!(regpipeline)\n", "transform!(regpipeline)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## Plot of regular TS with outlier normalization" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "2014-01-01\n", "\n", "\n", "2014-04-01\n", "\n", "\n", "2014-07-01\n", "\n", "\n", "2014-10-01\n", "\n", "\n", "2015-01-01\n", "\n", "\n", "3.5\n", "\n", "\n", "4.0\n", "\n", "\n", "4.5\n", "\n", "\n", "5.0\n", "\n", "\n", "5.5\n", "\n", "\n", "6.0\n", "\n", "\n", "Date\n", "\n", "\n", "Value\n", "\n", "\n", "\n" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regpipeline = Pipeline(Dict(\n", " :transformers => [regularfilecsv,valgator,valnner,mono,outliernicer,pltr]\n", " )\n", ")\n", "fit!(regpipeline)\n", "transform!(regpipeline)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TS Discovery by automatic data type classification" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "getting stats of AirOffTemp1.csv\n", "getting stats of AirOffTemp2.csv\n", "getting stats of AirOffTemp3.csv\n", "getting stats of Energy1.csv\n", "getting stats of Energy10.csv\n", "getting stats of Energy2.csv\n", "getting stats of Energy3.csv\n", "getting stats of Energy4.csv\n", "getting stats of Energy6.csv\n", "getting stats of Energy7.csv\n", "getting stats of Energy8.csv\n", "getting stats of Energy9.csv\n", "getting stats of Pressure1.csv\n", "getting stats of Pressure3.csv\n", "getting stats of Pressure4.csv\n", "getting stats of Pressure6.csv\n", "getting stats of RetTemp11.csv\n", "getting stats of RetTemp21.csv\n", "getting stats of RetTemp41.csv\n", 
"getting stats of RetTemp51.csv\n", "getting stats of AirOffTemp4.csv\n", "getting stats of AirOffTemp5.csv\n", "getting stats of Energy5.csv\n", "getting stats of Pressure5.csv\n", "getting stats of RetTemp31.csv\n", "loading model from file: /Users/ppalmes/.julia/packages/TSML/lqjQn/src/../data/realdatatsclassification/model/juliarfmodel.serialized\n" ] }, { "data": { "text/html": [ "

5 rows × 2 columns

fnamepredtype
StringSubStrin…
1AirOffTemp4.csvAirOffTemp
2AirOffTemp5.csvAirOffTemp
3Energy5.csvEnergy
4Pressure5.csvPressure
5RetTemp31.csvEnergy
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& fname & predtype\\\\\n", "\t\\hline\n", "\t& String & SubStrin…\\\\\n", "\t\\hline\n", "\t1 & AirOffTemp4.csv & AirOffTemp \\\\\n", "\t2 & AirOffTemp5.csv & AirOffTemp \\\\\n", "\t3 & Energy5.csv & Energy \\\\\n", "\t4 & Pressure5.csv & Pressure \\\\\n", "\t5 & RetTemp31.csv & Energy \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "5×2 DataFrame\n", "│ Row │ fname │ predtype │\n", "│ │ \u001b[90mString\u001b[39m │ \u001b[90mSubStrin…\u001b[39m │\n", "├─────┼─────────────────┼────────────┤\n", "│ 1 │ AirOffTemp4.csv │ AirOffTemp │\n", "│ 2 │ AirOffTemp5.csv │ AirOffTemp │\n", "│ 3 │ Energy5.csv │ Energy │\n", "│ 4 │ Pressure5.csv │ Pressure │\n", "│ 5 │ RetTemp31.csv │ Energy │" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using TSML: TSClassifier\n", "Random.seed!(12)\n", "\n", "trdirname = joinpath(dirname(pathof(TSML)),\"../data/realdatatsclassification/training\")\n", "tstdirname = joinpath(dirname(pathof(TSML)),\"../data/realdatatsclassification/testing\")\n", "modeldirname = joinpath(dirname(pathof(TSML)),\"../data/realdatatsclassification/model\")\n", "\n", "tscl = TSClassifier(Dict(:trdirectory=>trdirname,\n", " :tstdirectory=>tstdirname,\n", " :modeldirectory=>modeldirname,\n", " :feature_range => 6:20,\n", " :num_trees=>10)\n", ")\n", "\n", "fit!(tscl)\n", "dfresults = transform!(tscl)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "80.0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "apredict = dfresults.predtype\n", "fnames = dfresults.fname\n", "# Ground-truth type is the alphabetic prefix of the file name (e.g. \"Energy\" in \"Energy5.csv\")\n", "myregex = r\"(?<dtype>[A-Z _ - a-z]+)(?<number>\\d*)\\.(?<ext>\\w+)\"\n", "mtypes=map(fnames) do fname\n", " mymatch=match(myregex,fname)\n", " mymatch[:dtype]\n", "end\n", "\n", "# Percentage of test files whose predicted type equals the file-name prefix\n", "sum(mtypes .== apredict)/length(mtypes) * 100" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "Julia 1.2.0", "language": "julia", "name": "julia-1.2" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.2.0" } }, "nbformat": 4, "nbformat_minor": 2 }