{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "using TSML\n", "using DataFrames\n", "using Dates\n", "using CSV\n", "using Random" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DateValNNer(nothing, Dict{Symbol,Any}(:strict => true,:dateinterval => 1 hour,:aggregator => :median,:nnsize => 1,:missdirection => :symmetric))" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fname =\"testdata.csv\"\n", "dat = CSV.read(fname)\n", "rename!(dat,names(dat)[1]=>:Date,names(dat)[2]=>:Value)\n", "dat.Date = DateTime.(dat.Date,\"d/m/y H:M\")\n", "orig = deepcopy(dat)\n", "filter1 = DateValgator()\n", "filter2 = DateValNNer(Dict(:nnsize=>1))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n", "├─────┼─────────────────────┼──────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 10.0 │\n", "│ 2 │ 2014-01-01T01:00:00 │ 9.9 │\n", "│ 3 │ 2014-01-01T02:00:00 │ 10.0 │\n", "│ 4 │ 2014-01-01T03:00:00 │ 10.0 │\n", "│ 5 │ 2014-01-01T04:00:00 │ 10.0 │, 8761)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fit!(filter1,dat,[])\n", "res1=transform!(filter1,dat)\n", "(first(res1,5),size(res1,1))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n", "├─────┼─────────────────────┼──────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 10.0 │\n", "│ 2 │ 2014-01-01T01:00:00 │ 9.9 │\n", "│ 3 │ 2014-01-01T02:00:00 │ 10.0 │\n", "│ 4 │ 2014-01-01T03:00:00 │ 10.0 │\n", "│ 5 │ 2014-01-01T04:00:00 │ 10.0 │, 8761)" ] }, "execution_count": 4, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "fit!(filter2,res1,[])\n", "res2=transform!(filter2,res1)\n", "(first(res2,5),size(res2,1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(nothing, Dict{Symbol,Union{Nothing, Array{Transformer,1}}}(:transformers => Transformer[DateValgator(Dict{Symbol,Any}(:dateinterval => 1 hour,:aggregator => :median), Dict{Symbol,Any}(:dateinterval => 1 hour,:aggregator => :median)), DateValNNer(Dict{Symbol,Any}(:loopcount => 1,:strict => true,:dateinterval => 1 hour,:missingcount => 4931,:aggregator => :median,:nnsize => 1,:missdirection => :symmetric), Dict{Symbol,Any}(:loopcount => 1,:strict => true,:dateinterval => 1 hour,:missingcount => 4931,:aggregator => :median,:nnsize => 1,:missdirection => :symmetric))],:transformer_args => nothing))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mypipeline = Pipeline(Dict(\n", " :transformers => [filter1,filter2]\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

5 rows × 2 columns

DateValue
DateTimeFloat64⍰
12014-01-01T00:00:0010.0
22014-01-01T01:00:009.9
32014-01-01T02:00:0010.0
42014-01-01T03:00:0010.0
52014-01-01T04:00:0010.0
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& Date & Value\\\\\n", "\t\\hline\n", "\t& DateTime & Float64⍰\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 10.0 \\\\\n", "\t2 & 2014-01-01T01:00:00 & 9.9 \\\\\n", "\t3 & 2014-01-01T02:00:00 & 10.0 \\\\\n", "\t4 & 2014-01-01T03:00:00 & 10.0 \\\\\n", "\t5 & 2014-01-01T04:00:00 & 10.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "5×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n", "├─────┼─────────────────────┼──────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 10.0 │\n", "│ 2 │ 2014-01-01T01:00:00 │ 9.9 │\n", "│ 3 │ 2014-01-01T02:00:00 │ 10.0 │\n", "│ 4 │ 2014-01-01T03:00:00 │ 10.0 │\n", "│ 5 │ 2014-01-01T04:00:00 │ 10.0 │" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fit!(mypipeline,dat,[])\n", "resp = transform!(mypipeline,dat)\n", "first(resp,5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "transform! 
(generic function with 42 methods)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using TSML.TSMLTypes\n", "import TSML.TSMLTypes.fit!\n", "import TSML.TSMLTypes.transform!\n", "\n", "# Custom transformer that loads a two-column (date, value) CSV file into a DataFrame.\n", "# `args` holds the settings :filename and :dateformat (both default to the empty string);\n", "# `model` is `nothing` at construction and is populated by fit!.\n", "mutable struct MyCSVDateValReader <: Transformer\n", "  model\n", "  args\n", "  function MyCSVDateValReader(args=Dict())\n", "    default_args = Dict(\n", "      :filename => \"\",\n", "      :dateformat => \"\"\n", "    )\n", "    # mergedict comes from TSML; presumably caller-supplied args override the defaults (TODO confirm).\n", "    new(nothing,mergedict(default_args,args))\n", "  end\n", "end\n", "\n", "# Check that both :filename and :dateformat are non-empty (raises an error otherwise)\n", "# and store the validated arguments in the transformer's `model` field.\n", "# `x` and `y` are accepted to match the fit! signature but are not used.\n", "# Returns the arguments dictionary.\n", "function fit!(csvrdr::MyCSVDateValReader,x::T=[],y::Vector=[]) where {T<:Union{DataFrame,Vector,Matrix}}\n", "  fname = csvrdr.args[:filename]\n", "  fmt = csvrdr.args[:dateformat]\n", "  (fname != \"\" && fmt != \"\") || error(\"missing filename or date format\")\n", "  # Assign to the struct field: a bare `model = ...` only binds a local that is discarded on return.\n", "  csvrdr.model = csvrdr.args\n", "end\n", "\n", "# Read the CSV file named by :filename, require exactly two columns, rename them to\n", "# Date and Value, and parse the Date column with the :dateformat string.\n", "# `x` is accepted to match the transform! signature but is not used.\n", "# Returns the resulting DataFrame.\n", "function transform!(csvrdr::MyCSVDateValReader,x::T=[]) where {T<:Union{DataFrame,Vector,Matrix}}\n", "  fname = csvrdr.args[:filename]\n", "  fmt = csvrdr.args[:dateformat]\n", "  df = CSV.read(fname)\n", "  ncol(df) == 2 || error(\"dataframe should have only two columns: Date,Value\")\n", "  rename!(df,names(df)[1]=>:Date,names(df)[2]=>:Value)\n", "  df.Date = DateTime.(df.Date,fmt)\n", "  df\n", "end" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

5 rows × 2 columns

DateValue
DateTimeFloat64
12014-01-01T00:06:0010.0
22014-01-01T00:18:0010.0
32014-01-01T00:29:0010.0
42014-01-01T00:40:009.9
52014-01-01T00:51:009.9
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& Date & Value\\\\\n", "\t\\hline\n", "\t& DateTime & Float64\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:06:00 & 10.0 \\\\\n", "\t2 & 2014-01-01T00:18:00 & 10.0 \\\\\n", "\t3 & 2014-01-01T00:29:00 & 10.0 \\\\\n", "\t4 & 2014-01-01T00:40:00 & 9.9 \\\\\n", "\t5 & 2014-01-01T00:51:00 & 9.9 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "5×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────────────────┼─────────┤\n", "│ 1 │ 2014-01-01T00:06:00 │ 10.0 │\n", "│ 2 │ 2014-01-01T00:18:00 │ 10.0 │\n", "│ 3 │ 2014-01-01T00:29:00 │ 10.0 │\n", "│ 4 │ 2014-01-01T00:40:00 │ 9.9 │\n", "│ 5 │ 2014-01-01T00:51:00 │ 9.9 │" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvreader = MyCSVDateValReader(Dict(:filename=>\"testdata.csv\",:dateformat=>\"d/m/y H:M\"))\n", "fit!(csvreader)\n", "res=transform!(csvreader)\n", "first(res,5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

5 rows × 2 columns

DateValue
DateTimeFloat64⍰
12014-01-01T00:00:0010.0
22014-01-01T01:00:009.9
32014-01-01T02:00:0010.0
42014-01-01T03:00:0010.0
52014-01-01T04:00:0010.0
" ], "text/latex": [ "\\begin{tabular}{r|cc}\n", "\t& Date & Value\\\\\n", "\t\\hline\n", "\t& DateTime & Float64⍰\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 10.0 \\\\\n", "\t2 & 2014-01-01T01:00:00 & 9.9 \\\\\n", "\t3 & 2014-01-01T02:00:00 & 10.0 \\\\\n", "\t4 & 2014-01-01T03:00:00 & 10.0 \\\\\n", "\t5 & 2014-01-01T04:00:00 & 10.0 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "5×2 DataFrame\n", "│ Row │ Date │ Value │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n", "├─────┼─────────────────────┼──────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 10.0 │\n", "│ 2 │ 2014-01-01T01:00:00 │ 9.9 │\n", "│ 3 │ 2014-01-01T02:00:00 │ 10.0 │\n", "│ 4 │ 2014-01-01T03:00:00 │ 10.0 │\n", "│ 5 │ 2014-01-01T04:00:00 │ 10.0 │" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mypipeline = Pipeline(Dict(\n", " :transformers => [csvreader,filter1,filter2]\n", " )\n", ")\n", "fit!(mypipeline)\n", "res=transform!(mypipeline)\n", "first(res,5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4931" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filter2.args[:missingcount]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "mydate=DateTime(2014,1,1):Dates.Minute(15):DateTime(2014,1,3) # 15 minutes interval\n", "values = Array{Union{Float64,Missing}}(sin.(1:0.1:length(mydate)) .+ cos.(1:0.1:length(mydate)))[1:length(mydate)];\n", "x = DataFrame(Date=mydate,Value=values); xx = deepcopy(x);first(x,10)\n", "nmissing=floor(nrow(x) * 0.80) |> Integer\n", "ndxmissing=Random.shuffle(1:nrow(x))[1:nmissing]\n", "x.Value[ndxmissing] .= missing; first(x,15)\n", "dvtr = TSML.DateValgator()\n", "TSML.fit!(dvtr,x,[]);TSML.fit!(dvtr,xx,[])\n", "inputx = TSML.transform!(dvtr,x); inputxx = TSML.transform!(dvtr,xx);\n", "dvnnr = 
TSML.DateValNNer(Dict(:dateinterval=>Dates.Hour(1),:nnsize=>1))\n", "TSML.fit!(dvnnr,inputx,[])\n", "pred_y=TSML.transform!(dvnnr,inputx);pred_yy=TSML.transform!(dvnnr,inputxx);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Euclidean distance between the series imputed from the data with missing values (pred_y)\n", "# and the series computed from the complete data (pred_yy).\n", "# Each difference is squared BEFORE summing so positive and negative errors cannot cancel out.\n", "sqrt(sum((pred_y.Value .- pred_yy.Value).^2))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "rename!(pred_y,:Value=>:MissingVals);" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

49 rows × 4 columns

DateMissingValsValueerror
DateTimeFloat64⍰Float64⍰Float64
12014-01-01T00:00:000.7466461.363290.380248
22014-01-01T01:00:000.7466461.193240.199444
32014-01-01T02:00:000.7466460.8047330.00337414
42014-01-01T03:00:00-0.1054540.2891790.155736
52014-01-01T04:00:00-0.957554-0.2720290.469945
62014-01-01T05:00:00-0.957554-0.7902910.0279772
72014-01-01T06:00:00-1.18563-1.183783.40069e-6
82014-01-01T07:00:00-1.4137-1.390380.000543709
92014-01-01T08:00:00-1.38614-1.377477.52128e-5
102014-01-01T09:00:00-1.05908-1.147080.00774523
112014-01-01T10:00:00-0.63353-0.7356010.0104185
122014-01-01T11:00:00-0.207982-0.2079820.0
132014-01-01T12:00:000.5268960.3524720.0304236
142014-01-01T13:00:001.261770.8572790.163616
152014-01-01T14:00:001.261771.226740.00122732
162014-01-01T15:00:000.6587261.402530.55324
172014-01-01T16:00:000.05567711.356881.69314
182014-01-01T17:00:000.05567711.097021.0844
192014-01-01T18:00:000.05567710.6639610.370009
202014-01-01T19:00:000.05567710.1260760.00495603
212014-01-01T20:00:00-0.499012-0.4317130.00452912
222014-01-01T21:00:00-0.956576-0.9213440.00124128
232014-01-01T22:00:00-1.41414-1.265520.0220894
242014-01-01T23:00:00-1.41414-1.407084.99116e-5
252014-01-02T00:00:00-1.30811-1.331670.000555052
262014-01-02T01:00:00-0.303306-1.043210.547464
272014-01-02T02:00:000.701501-0.5900561.66812
282014-01-02T03:00:000.701501-0.04373990.555384
292014-01-02T04:00:000.7015010.5094820.0368714
302014-01-02T05:00:001.033060.9822670.00257969
" ], "text/latex": [ "\\begin{tabular}{r|cccc}\n", "\t& Date & MissingVals & Value & error\\\\\n", "\t\\hline\n", "\t& DateTime & Float64⍰ & Float64⍰ & Float64\\\\\n", "\t\\hline\n", "\t1 & 2014-01-01T00:00:00 & 0.746646 & 1.36329 & 0.380248 \\\\\n", "\t2 & 2014-01-01T01:00:00 & 0.746646 & 1.19324 & 0.199444 \\\\\n", "\t3 & 2014-01-01T02:00:00 & 0.746646 & 0.804733 & 0.00337414 \\\\\n", "\t4 & 2014-01-01T03:00:00 & -0.105454 & 0.289179 & 0.155736 \\\\\n", "\t5 & 2014-01-01T04:00:00 & -0.957554 & -0.272029 & 0.469945 \\\\\n", "\t6 & 2014-01-01T05:00:00 & -0.957554 & -0.790291 & 0.0279772 \\\\\n", "\t7 & 2014-01-01T06:00:00 & -1.18563 & -1.18378 & 3.40069e-6 \\\\\n", "\t8 & 2014-01-01T07:00:00 & -1.4137 & -1.39038 & 0.000543709 \\\\\n", "\t9 & 2014-01-01T08:00:00 & -1.38614 & -1.37747 & 7.52128e-5 \\\\\n", "\t10 & 2014-01-01T09:00:00 & -1.05908 & -1.14708 & 0.00774523 \\\\\n", "\t11 & 2014-01-01T10:00:00 & -0.63353 & -0.735601 & 0.0104185 \\\\\n", "\t12 & 2014-01-01T11:00:00 & -0.207982 & -0.207982 & 0.0 \\\\\n", "\t13 & 2014-01-01T12:00:00 & 0.526896 & 0.352472 & 0.0304236 \\\\\n", "\t14 & 2014-01-01T13:00:00 & 1.26177 & 0.857279 & 0.163616 \\\\\n", "\t15 & 2014-01-01T14:00:00 & 1.26177 & 1.22674 & 0.00122732 \\\\\n", "\t16 & 2014-01-01T15:00:00 & 0.658726 & 1.40253 & 0.55324 \\\\\n", "\t17 & 2014-01-01T16:00:00 & 0.0556771 & 1.35688 & 1.69314 \\\\\n", "\t18 & 2014-01-01T17:00:00 & 0.0556771 & 1.09702 & 1.0844 \\\\\n", "\t19 & 2014-01-01T18:00:00 & 0.0556771 & 0.663961 & 0.370009 \\\\\n", "\t20 & 2014-01-01T19:00:00 & 0.0556771 & 0.126076 & 0.00495603 \\\\\n", "\t21 & 2014-01-01T20:00:00 & -0.499012 & -0.431713 & 0.00452912 \\\\\n", "\t22 & 2014-01-01T21:00:00 & -0.956576 & -0.921344 & 0.00124128 \\\\\n", "\t23 & 2014-01-01T22:00:00 & -1.41414 & -1.26552 & 0.0220894 \\\\\n", "\t24 & 2014-01-01T23:00:00 & -1.41414 & -1.40708 & 4.99116e-5 \\\\\n", "\t25 & 2014-01-02T00:00:00 & -1.30811 & -1.33167 & 0.000555052 \\\\\n", "\t26 & 2014-01-02T01:00:00 & -0.303306 & 
-1.04321 & 0.547464 \\\\\n", "\t27 & 2014-01-02T02:00:00 & 0.701501 & -0.590056 & 1.66812 \\\\\n", "\t28 & 2014-01-02T03:00:00 & 0.701501 & -0.0437399 & 0.555384 \\\\\n", "\t29 & 2014-01-02T04:00:00 & 0.701501 & 0.509482 & 0.0368714 \\\\\n", "\t30 & 2014-01-02T05:00:00 & 1.03306 & 0.982267 & 0.00257969 \\\\\n", "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "49×4 DataFrame\n", "│ Row │ Date │ MissingVals │ Value │ error │\n", "│ │ \u001b[90mDateTime\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", "├─────┼─────────────────────┼─────────────┼───────────┼─────────────┤\n", "│ 1 │ 2014-01-01T00:00:00 │ 0.746646 │ 1.36329 │ 0.380248 │\n", "│ 2 │ 2014-01-01T01:00:00 │ 0.746646 │ 1.19324 │ 0.199444 │\n", "│ 3 │ 2014-01-01T02:00:00 │ 0.746646 │ 0.804733 │ 0.00337414 │\n", "│ 4 │ 2014-01-01T03:00:00 │ -0.105454 │ 0.289179 │ 0.155736 │\n", "│ 5 │ 2014-01-01T04:00:00 │ -0.957554 │ -0.272029 │ 0.469945 │\n", "│ 6 │ 2014-01-01T05:00:00 │ -0.957554 │ -0.790291 │ 0.0279772 │\n", "│ 7 │ 2014-01-01T06:00:00 │ -1.18563 │ -1.18378 │ 3.40069e-6 │\n", "│ 8 │ 2014-01-01T07:00:00 │ -1.4137 │ -1.39038 │ 0.000543709 │\n", "│ 9 │ 2014-01-01T08:00:00 │ -1.38614 │ -1.37747 │ 7.52128e-5 │\n", "│ 10 │ 2014-01-01T09:00:00 │ -1.05908 │ -1.14708 │ 0.00774523 │\n", "⋮\n", "│ 39 │ 2014-01-02T14:00:00 │ -1.38786 │ -1.33 │ 0.00334763 │\n", "│ 40 │ 2014-01-02T15:00:00 │ -1.41018 │ -1.40712 │ 9.38735e-6 │\n", "│ 41 │ 2014-01-02T16:00:00 │ -1.23038 │ -1.26773 │ 0.00139453 │\n", "│ 42 │ 2014-01-02T17:00:00 │ -0.730748 │ -0.925126 │ 0.0377827 │\n", "│ 43 │ 2014-01-02T18:00:00 │ -0.231113 │ -0.436468 │ 0.0421707 │\n", "│ 44 │ 2014-01-02T19:00:00 │ 0.552253 │ 0.121099 │ 0.185894 │\n", "│ 45 │ 2014-01-02T20:00:00 │ 1.33562 │ 0.659546 │ 0.457074 │\n", "│ 46 │ 2014-01-02T21:00:00 │ 1.33562 │ 1.09387 │ 0.0584445 │\n", "│ 47 │ 2014-01-02T22:00:00 │ 1.33562 │ 1.35549 │ 0.000394795 │\n", "│ 48 │ 
2014-01-02T23:00:00 │ 1.41122 │ 1.40311 │ 6.58315e-5 │\n", "│ 49 │ 2014-01-03T00:00:00 │ 1.19439 │ 1.26403 │ 0.00484856 │" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jx = join(pred_y,pred_yy,on=:Date)\n", "jx.error = (jx.MissingVals .- jx.Value).^2;jx" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "Julia 1.2.0", "language": "julia", "name": "julia-1.2" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.2.0" } }, "nbformat": 4, "nbformat_minor": 2 }