{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.7.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "$$hashKey": "object:192", "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster", "editable": true, "gnetId": 12239, "graphTooltip": 0, "id": null, "iteration": 1588401887165, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 18, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 12, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "instant": false, "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Temperature", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "datasource": "${DS_PROMETHEUS}", "gridPos": { "h": 8, "w": 6, "x": 18, "y": 0 }, "id": 14, "options": { "fieldOptions": { "calcs": [ "mean" ], "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "#EAB839", "value": 83 }, { "color": "red", "value": 87 } ] }, "unit": "celsius" }, "overrides": [], "values": false }, "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, "pluginVersion": "6.7.3", "targets": [ { "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "GPU Avg. Temp", "type": "gauge" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 18, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 10, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pluginVersion": "6.5.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Power Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "datasource": "${DS_PROMETHEUS}", "gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 }, "id": 16, "links": [], "options": { "fieldOptions": { "calcs": [ "sum" ], "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 2400, "min": 0, "nullValueMode": "connected", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "#EAB839", "value": 1800 }, { "color": "red", "value": 2200 } ] }, "unit": "watt" }, "overrides": [], "values": false }, "orientation": "horizontal", "showThresholdLabels": false, "showThresholdMarkers": true }, "pluginVersion": "6.7.3", "targets": [ { "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "GPU Power Total", "type": "gauge" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 2, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU SM Clocks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "hertz", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, "id": 6, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, "hiddenSeries": false, "id": 18, "legend": { "avg": true, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Framebuffer Mem Used", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decmbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Tensor Core Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": "1", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 22, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", "hide": 0, "includeAll": false, "label": null, "multi": true, "name": "instance", "options": [], "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "label_values(gpu)", "hide": 0, "includeAll": true, "index": -1, "label": null, "multi": true, "name": "gpu", "options": [], "query": "label_values(gpu)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-15m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", "variables": { "list": [] }, "version": 1 }