{ "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": false, "gnetId": null, "graphTooltip": 0, "links": [], "panels": [ { "collapsed": false, "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 19, "panels": [], "repeat": "hostname", "title": "$hostname Metrics", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }, "hiddenSeries": false, "id": 21, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": null, "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_load1{instance=\"$hostname:9100\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "1min", "refId": "A" }, { "expr": "node_load5{instance=\"$hostname:9100\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "5min", "refId": "B" }, { "expr": "node_load15{instance=\"$hostname:9100\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "15min", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "System Load", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 8, "x": 6, "y": 1 }, "hiddenSeries": false, "id": 24, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"$hostname:9400\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Power Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "watt", "gauge": { "maxValue": 2400, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 1 }, "id": 30, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=\"$hostname:9400\"})", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "1800,2200", "title": "GPU Power Total", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 7, "x": 17, "y": 1 }, "hiddenSeries": false, "id": 35, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "1e6*sum(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{instance=\"$hostname:9400\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "Total", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NVLINK Bandwidth", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 6 }, "hiddenSeries": false, "id": 46, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "node_memory_MemTotal_bytes{instance=\"$hostname:9100\"} - node_memory_MemFree_bytes{instance=\"$hostname:9100\"} - node_memory_Buffers_bytes{instance=\"$hostname:9100\"} - node_memory_Cached_bytes{instance=\"$hostname:9100\"}", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "Used", "refId": "A" }, { "expr": "node_memory_Buffers_bytes{instance=\"$hostname:9100\"}", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "Buffered", "refId": "B" }, { "expr": "node_memory_Cached_bytes{instance=\"$hostname:9100\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "Cached", "refId": "C" }, { "expr": "node_memory_MemFree_bytes{instance=\"$hostname:9100\"}", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "Free", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "System Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 5, "w": 8, "x": 6, "y": 6 }, "hiddenSeries": false, "id": 25, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"$hostname:9400\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Temperature", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "description": "GPU Shutdown Temp : 90 C\nGPU Slowdown Temp : 87 C\nGPU Max Operating Temp : 83 C", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "celsius", "gauge": { "maxValue": 90, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 6 }, "id": 31, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=\"$hostname:9400\"})", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "83,87", "title": "GPU Avg. Temp", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 7, "x": 17, "y": 6 }, "hiddenSeries": false, "id": 36, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "Rx", "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(DCGM_FI_PROF_PCIE_TX_BYTES{instance=\"$hostname:9400\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "Tx", "refId": "A" }, { "expr": "sum(DCGM_FI_PROF_PCIE_RX_BYTES{instance=\"$hostname:9400\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "Rx", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PCIe Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "percentunit", "gauge": { "maxValue": 1, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 0, "y": 11 }, "id": 27, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$hostname:9100\"}) - sum(node_filesystem_avail_bytes{device!=\"rootfs\",instance=\"$hostname:9100\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$hostname:9100\"})", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "", "refId": "C" } ], "thresholds": "0.75,0.9", "title": "Disk Usage", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 3, "y": 11 }, "id": 28, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "{instance=\"$hostname:9100\", job=\"cluster\", module=\"node-exporter\"}", "targets": [ { "expr": "((node_memory_MemTotal_bytes{instance=\"$hostname:9100\"} - node_memory_MemFree_bytes{instance=\"$hostname:9100\"} - node_memory_Buffers_bytes{instance=\"$hostname:9100\"} - node_memory_Cached_bytes{instance=\"$hostname:9100\"}) / node_memory_MemTotal_bytes{instance=\"$hostname:9100\"}) * 100", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "", "refId": "A" } ], "thresholds": "80,90", "title": "Memory Usage", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 5, "w": 8, "x": 6, "y": 11 }, "hiddenSeries": false, "id": 57, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"$hostname:9400\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "description": "", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 11 }, "id": 58, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance=\"$hostname:9400\"})", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "80,90", "title": "GPU Total Utilization", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 7, "x": 17, "y": 11 }, "hiddenSeries": false, "id": 37, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "Receive", "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_network_transmit_bytes_total{instance=\"$hostname:9100\",device=~\"en.*\"}[$__interval]))", "format": "time_series", "hide": false, "interval": ".5m", "intervalFactor": 2, "legendFormat": "Transmit", "refId": "B" }, { "expr": "sum(rate(node_network_receive_bytes_total{instance=\"$hostname:9100\",device=~\"en.*\"}[$__interval]))", "format": "time_series", "hide": false, "instant": false, "interval": ".5m", "intervalFactor": 2, "legendFormat": "Receive", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ethernet Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 38, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "I/O time", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_read_bytes_total{instance=\"$hostname:9100\"}[$__interval])", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 4, "legendFormat": "Read ({{device}})", "refId": "E" }, { "expr": "rate(node_disk_written_bytes_total{instance=\"$hostname:9100\"}[$__interval])", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 4, "legendFormat": "Write ({{device}})", "refId": "F" }, { "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$hostname:9100\"}[$__interval]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 4, "legendFormat": "I/O time", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 5, "w": 8, "x": 6, "y": 16 }, "hiddenSeries": false, "id": 39, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=\"$hostname:9400\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Mem Cpy Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "description": "", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 16 }, "id": 40, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance=\"$hostname:9400\"})", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "70,90", "title": "GPU Avg. Mem. Cpy Utilization", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 7, "x": 17, "y": 16 }, "hiddenSeries": false, "id": 41, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "Receive", "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_infiniband_port_data_transmitted_bytes_total{instance=\"$hostname:9100\"}[$__interval]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "Transmit", "refId": "B" }, { "expr": "sum(rate(node_infiniband_port_data_received_bytes_total{instance=\"$hostname:9100\"}[$__interval]))", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "Receive", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Infiniband Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "hertz", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 2, "w": 3, "x": 0, "y": 21 }, "id": 59, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "nodename", "targets": [ { "expr": "node_uname_info{instance=\"$hostname:9100\"}", "format": "table", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Hostname", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "hertz", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 2, "w": 3, "x": 3, "y": 21 }, "id": 60, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "release", "targets": [ { "expr": "node_uname_info{instance=\"$hostname:9100\"}", "format": "table", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Kernel", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 5, "w": 8, "x": 6, "y": 21 }, "hiddenSeries": false, "id": 42, "legend": { "avg": false, "current": true, "max": false, "min": false, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_FB_USED{instance=\"$hostname:9400\"}/(DCGM_FI_DEV_FB_USED{instance=\"$hostname:9400\"}+DCGM_FI_DEV_FB_FREE{instance=\"$hostname:9400\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "GPU {{gpu}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Mem Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "watt", "gauge": { "maxValue": 3200, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 5, "w": 3, "x": 14, "y": 21 }, "id": 54, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "node_hwmon_power_average_watt{chip=\"lnxsybus:00_acpi000d:00\", instance=\"$hostname:9100\", job=\"cluster\", module=\"node-exporter\", sensor=\"power1\"}", "targets": [ { "expr": "node_hwmon_power_average_watt{instance=\"$hostname:9100\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "2600,3000", "title": "System Total Power Draw", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": { "PSU1 Input": "#6ed0e0", "PSU2 Input": "#eab839", "PSU3 Input": "#ef843c", "PSU4 Input": "#d683ce" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 5, "w": 7, "x": 17, "y": 21 }, "hiddenSeries": false, "id": 52, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_hwmon_power_average_watt{instance=\"$hostname:9100\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Total", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "System Power Draw", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "hertz", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 2, "w": 3, "x": 0, "y": 23 }, "id": 44, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance=\"$hostname:9400\"}*1000000)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "", "title": "GPU SM Clocks", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "format": "hertz", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 2, "w": 3, "x": 3, "y": 23 }, "id": 45, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance=\"$hostname:9400\"}*1000000)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "", "title": "GPU Memory Clocks", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "refresh": "30s", "schemaVersion": 25, "style": "dark", "tags": [ "cluster", "GPU" ], "templating": { "list": [ { "current": { "text": "Prometheus", "value": "Prometheus" }, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "datasource", "options": [], "query": "prometheus", "refresh": 1, "regex": "", "skipUrlSync": false, "type": "datasource" }, { "allValue": null, "current": { "selected": true, "text": "$hostname", "value": "$hostname" }, "datasource": "$datasource", "definition": "", "hide": 0, "includeAll": false, "label": "server", "multi": false, "name": "hostname", "options": [], "query": "query_result(label_replace(DCGM_FI_DEV_POWER_USAGE{instance=~\".*:9400\"}, \"instance\", \"$1\", \"instance\", \"(.*):9400.*\"))", "refresh": 1, "regex": "/.*instance=\"([^\"]*).*/", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-15m", "to": "now" }, "timepicker": { "refresh_intervals": [ "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "GPU Nodes", "uid": "syl6zhqkz", "version": 1 }