// cpu_alert_stream
//
// Stream task that alerts on total CPU utilisation per host.
// metric: used CPU percent, derived as 100 - usage_idle
// other cpu fields reported by Telegraf:
//   "usage_guest","usage_guest_nice","usage_idle","usage_iowait",
//   "usage_irq","usage_nice","usage_softirq","usage_steal","usage_system"
//
// TELEGRAF CONFIGURATION
// [[inputs.cpu]]
//   percpu = true
//   totalcpu = true
//   fielddrop = ["time_*"]
//
// DEFINE: kapacitor define cpu_alert_stream -type stream -tick cpu/cpu_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable cpu_alert_stream

// ---- Parameters ----

// Absolute thresholds, compared against the windowed mean of used CPU percent.
var info = 70
var warn = 80
var crit = 90

// Deviation thresholds, compared against the value returned by sigma().
var infoSig = 2.5
var warnSig = 3
var critSig = 3.5

// Window length and emit interval; equal values give back-to-back windows.
var period = 10s
var every = 10s

// ---- Dataframe ----

// Take the aggregate 'cpu-total' series of the cpu measurement, grouped by host,
// convert idle percent into used percent, and average it over each window.
var cpuUsed = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('cpu')
        .where(lambda: "cpu" == 'cpu-total')
        .groupBy('host')
    |eval(lambda: 100.0 - "usage_idle")
        .as('used')
    |window()
        .period(period)
        .every(every)
    |mean('used')
        .as('stat')

// ---- Thresholds and alert ----

// Add a 'sigma' field next to 'stat' (keep() retains the existing fields), then
// raise a level when either the absolute or the deviation threshold is exceeded.
// Per the Kapacitor docs, sigma() reports how many standard deviations a value
// is away from the running mean.
// Every alert event is appended to a log file.
cpuUsed
    |eval(lambda: sigma("stat"))
        .as('sigma')
        .keep()
    |alert()
        .id('{{ index .Tags "host"}}/cpu_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > info OR "sigma" > infoSig)
        .warn(lambda: "stat" > warn OR "sigma" > warnSig)
        .crit(lambda: "stat" > crit OR "sigma" > critSig)
        .log('/tmp/cpu_alert_log.txt')