{ "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 165, "links": [], "panels": [ { "alert": { "alertRuleTags": {}, "conditions": [ { "evaluator": { "params": [ 90 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "keep_state", "for": "1m", "frequency": "1m", "handler": 1, "message": "https://oapi.dingtalk.com/robot/send?access_token=bd7a2d279e624eccf6444057e17a268f5bf18f0a7ec0f4bf4fda39c5643d8f62", "name": "RANKING-CPU alert", "noDataState": "keep_state", "notifications": [ { "uid": "xJuGLh0Wk" } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "new-influx", "fieldConfig": { "defaults": { "custom": {}, "links": [], "unit": "percent" }, "overrides": [] }, "fill": 5, "fillGradient": 0, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.3.7", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "alias": "", "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as power_usage FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'gpu_utilization' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 90 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU-UTILIZATION", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": true, "alignLevel": null } }, { "alert": { "alertRuleTags": {}, "conditions": [ { "evaluator": { "params": [ 1000000 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" }, { "evaluator": { "params": [ 1000000 ], "type": "gt" }, "operator": { "type": "or" }, "query": { "params": [ "B", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "keep_state", "for": "1000m", "frequency": "1m", "handler": 1, "message": "https://oapi.dingtalk.com/robot/send?access_token=bd7a2d279e624eccf6444057e17a268f5bf18f0a7ec0f4bf4fda39c5643d8f62", "name": "RANKING-CPU alert", "noDataState": "keep_state", "notifications": [ { "uid": "xJuGLh0Wk" } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "new-influx", "fieldConfig": { "defaults": { "custom": {}, "links": [], "unit": "kbytes" }, "overrides": [] }, "fill": 5, "fillGradient": 0, "gridPos": { "h": 7, "w": 6, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 12, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.3.7", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as pcie_rx FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'pcie_rx_throughput' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] }, { "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as pcie_tx FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'pcie_tx_throughput' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1000000 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU-PCIE-IO", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "kbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": true, "alignLevel": null } }, { "alert": { "alertRuleTags": {}, "conditions": [ { "evaluator": { "params": [ 90 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "keep_state", "for": "1m", "frequency": "1m", "handler": 1, "message": "https://oapi.dingtalk.com/robot/send?access_token=bd7a2d279e624eccf6444057e17a268f5bf18f0a7ec0f4bf4fda39c5643d8f62", "name": "RANKING-CPU alert", "noDataState": "keep_state", "notifications": [ { "uid": "xJuGLh0Wk" } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "new-influx", "fieldConfig": { "defaults": { "custom": {}, "links": [], "unit": "celsius" }, "overrides": [] }, "fill": 5, "fillGradient": 0, "gridPos": { "h": 7, "w": 6, "x": 18, "y": 0 }, "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.3.7", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as power_usage FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'gpu_temp' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 90 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU-TEMP", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": true, "alignLevel": null } }, { "alert": { "alertRuleTags": {}, "conditions": [ { "evaluator": { "params": [ 178770 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "keep_state", "for": "1m", "frequency": "1m", "handler": 1, "message": "https://oapi.dingtalk.com/robot/send?access_token=bd7a2d279e624eccf6444057e17a268f5bf18f0a7ec0f4bf4fda39c5643d8f62", "name": "MEM USAGE", "noDataState": "keep_state", "notifications": [ { "uid": "xJuGLh0Wk" } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "new-influx", "fieldConfig": { "defaults": { "custom": {}, "links": [], "unit": "mbytes" }, "overrides": [] }, "fill": 2, "fillGradient": 5, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 7 }, "hiddenSeries": false, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.3.7", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as power_usage FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'fb_used' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 178770 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU-POWER-USAGE", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "mbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": true, "alignLevel": null } }, { "alert": { "alertRuleTags": {}, "conditions": [ { "evaluator": { "params": [ 150 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "keep_state", "for": "1m", "frequency": "1m", "handler": 1, "message": "https://oapi.dingtalk.com/robot/send?access_token=bd7a2d279e624eccf6444057e17a268f5bf18f0a7ec0f4bf4fda39c5643d8f62", "name": "RANKING-CPU alert", "noDataState": "keep_state", "notifications": [ { "uid": "xJuGLh0Wk" } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "new-influx", "fieldConfig": { "defaults": { "custom": {}, "links": [], "unit": "watt" }, "overrides": [] }, "fill": 5, "fillGradient": 10, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "hiddenSeries": false, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.3.7", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "groupBy": [ { "params": [ "$__interval" ], "type": "time" }, { "params": [ "null" ], "type": "fill" } ], "hide": false, "orderByTime": "ASC", "policy": "default", "query": "SELECT (mean(\"value\")) as power_usage FROM \"autogen\".\"dcgm_collectd_value\" WHERE (\"type\" = 'power_usage' AND \"host\" =~ /-newgpu/ AND \"host\" !~ /-convert/ )AND $timeFilter GROUP BY time(1m), \"host\" fill(none)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ "value" ], "type": "field" }, { "params": [], "type": "mean" } ] ], "tags": [] } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 150 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU-POWER-USAGE", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": true, "alignLevel": null } } ], "refresh": "10s", "schemaVersion": 26, "style": "dark", "tags": [ "NVIDIA", "GPU", "USAGE" ], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "OMG-GPU-DASHBOARD", "uid": "UwPhe2dnk", "version": 24 }