{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" }, { "name": "DS_EXPRESSION", "label": "Expression", "description": "", "type": "datasource", "pluginId": "__expr__" } ], "__elements": {}, "__requires": [ { "type": "datasource", "id": "__expr__", "version": "1.0.0" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "11.2.2" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "stat", "name": "Stat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" }, { "type": "panel", "id": "text", "name": "Text", "version": "" }, { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "links": [], "panels": [ { "description": "Name of the job ID or pod", "gridPos": { "h": 4, "w": 5, "x": 0, "y": 0 }, "id": 2, "options": { "code": { "language": "plaintext", "showLineNumbers": false, "showMiniMap": false }, "content": "${g_job_id}\n${g_pod}", "mode": "markdown" }, "pluginVersion": "11.2.2", "title": "Name", "type": "text" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "% of VRAM used across all GPUs this job or pod is running on", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 5, "y": 0 }, "id": 10, "maxDataPoints": 60, "options": { "colorMode": "value", 
"graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_total_vram{job_id!=\"\", job_id=\"$g_job_id\"})", "hide": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_total_vram{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "D", "useBackend": false }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$A/$B", "hide": false, "refId": "Job VRAM Usage", "type": "math" }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$C/$D", "hide": false, "refId": "Pod VRAM Usage", "type": "math" } ], "title": "Memory Usage", "type": "stat" }, { 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Maximum PCIe speed", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "MBs" }, "overrides": [] }, "gridPos": { "h": 3, "w": 4, "x": 8, "y": 0 }, "id": 12, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "max(${g_metrics_prefix}pcie_max_speed{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{job_id}}", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "max(${g_metrics_prefix}pcie_max_speed{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "{{pod}}", "range": true, "refId": "B", "useBackend": false } ], "title": "PCIe Max Speed", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 4, "x": 12, "y": 0 }, "id": 15, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": 
"standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "Recovery", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "Recovery", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "Replay", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "Replay", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "Replay Rollover", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": 
"sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "Replay Rollover", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "NACK Received", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "NACK Received", "range": true, "refId": "H" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "NACK Sent", "range": true, "refId": "I" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "hide": false, "instant": false, "legendFormat": "NACK Sent", "range": true, "refId": "J" } ], "title": "PCIe Counts", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Top 5 current GPU GFX Activity, labeled with hostname and GPU ID.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "targetBlank": true, "title": "Go to GPU Dashboard", "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" } ], "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": 
"red", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 12, "w": 4, "x": 16, "y": 0 }, "id": 16, "maxDataPoints": 60, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "max" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_busy_instantaneous{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "A1", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_busy_instantaneous{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "B1", "useBackend": false } ], "title": "Top 5 GPU Usage", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Top 5 GPUs with highest VRAM used, labeled with hostname and GPU ID.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "targetBlank": true, "title": "Go to GPU Dashboard", "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" } ], "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "decmbytes" }, "overrides": [] }, "gridPos": { "h": 12, "w": 4, "x": 20, "y": 0 }, "id": 17, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "max" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" 
}, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "{{hostname}} | {{gpu_id}} | {{xcc_index}}", "range": false, "refId": "B", "useBackend": false } ], "title": "Top 5 Used VRAM", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total current PCIe bandwidth", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "MBs" }, "overrides": [] }, "gridPos": { "h": 3, "w": 4, "x": 8, "y": 3 }, "id": 13, "maxDataPoints": 60, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum(${g_metrics_prefix}pcie_bandwidth{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "PCIe Bandwidth", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "sum(${g_metrics_prefix}pcie_bandwidth{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "PCIe Bandwidth", "range": true, "refId": "B", "useBackend": false } ], "title": "Total Current PCIe Bandwidth", "type": "stat" }, { 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "\\# of compute nodes used by the job", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "targetBlank": true, "title": "Go to Compute Node Dashboard", "url": "/d/de1q9vq97fe2oc/compute-node" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 0, "y": 4 }, "id": 11, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "instant": false, "legendFormat": "# of compute nodes used", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "hide": false, "instant": false, "legendFormat": "# of compute nodes used", "range": true, "refId": "A1" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "hide": false, "instant": false, "legendFormat": "# of compute nodes used", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "hide": false, "instant": false, "legendFormat": "# of compute nodes used", "range": true, "refId": "B1" } ], "title": "Compute Nodes", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total package power usage, in Watts", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 4, "w": 5, "x": 3, "y": 4 }, "id": 9, "maxDataPoints": 60, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_average_package_power{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "Average Job Package Power Usage", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_package_power{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "Job Package Power Usage", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, 
"editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_average_package_power{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "Average Pod Package Power Usage", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_package_power{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "Pod Package Power Usage", "range": true, "refId": "D", "useBackend": false }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$A+$B", "hide": false, "refId": "Total Job Package Power Usage", "type": "math" }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$C+$D", "hide": false, "refId": "Total Pod Package Power Usage", "type": "math" } ], "title": "Total Power Usage", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total PCIe bandwidth over time", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "displayName": "Total PCIe Bandwidth", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "MBs" 
}, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 8, "y": 6 }, "id": 14, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}pcie_bandwidth{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}pcie_bandwidth{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false } ], "title": "Total PCIe Bandwidth", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "\\# of GPUs allocated to the job or pod", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "targetBlank": true, "title": "Go to GPU Dashboard", "url": "/d/ae0aj8euc43r4b/gpu" } ], "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "Unhealthy (jobs)" }, "properties": [ { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } } ] }, { "matcher": { "id": "byFrameRefID", "options": "Unhealthy (pods)" }, "properties": [ { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 4, "w": 3, "x": 0, "y": 8 }, "id": 3, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", 
"reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 9 }, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Allocated by Jobs", "range": false, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Allocated by Jobs", "range": false, "refId": "A1", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Allocated by Pods", "range": false, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"}))", "fullMetaSearch": 
false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Allocated by Pods", "range": false, "refId": "B1", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"} > 0))", "hide": false, "instant": true, "legendFormat": "Busy GPUs", "range": false, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"} > 0))", "hide": false, "instant": true, "legendFormat": "Busy GPUs", "range": false, "refId": "C1" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"} > 0))", "hide": false, "instant": true, "legendFormat": "Busy GPUs", "range": false, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_busy_instantaneous{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"} > 0))", "hide": false, "instant": true, "legendFormat": "Busy GPUs", "range": false, "refId": "D1" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"} < 1))", "hide": false, "instant": true, "legendFormat": "Unhealthy GPUs", "range": false, "refId": "Unhealthy (jobs)" }, { 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"} < 1))", "hide": false, "instant": true, "legendFormat": "Unhealthy GPUs", "range": false, "refId": "Unhealthy (pods)" } ], "title": "Allocated GPUs", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 5, "x": 3, "y": 8 }, "id": 18, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Correctable", "range": false, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Correctable", "range": false, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, 
"editorMode": "code", "exemplar": false, "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Uncorrectable", "range": false, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", "exemplar": false, "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": true, "legendFormat": "Uncorrectable", "range": false, "refId": "D", "useBackend": false } ], "title": "Total ECC Counts", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Average of current GFX activity", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "displayName": "GPU Usage", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 0, "y": 12 }, "id": 1, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": 
"prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_gfx_busy_instantaneous{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_gfx_busy_instantaneous{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-D65208-0C|102-D67305-00|102-D65209-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "D", "useBackend": false } ], "title": "Average GPU Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "% of used VRAM across GPUs the job or pod is running on", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, 
"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 8, "y": 12 }, "id": 5, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_total_vram{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, 
"instant": false, "legendFormat": "__auto", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "sum(${g_metrics_prefix}gpu_total_vram{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": true, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "D", "useBackend": false }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$A/$B", "hide": false, "refId": "Job Memory Usage", "type": "math" }, { "datasource": { "type": "__expr__", "uid": "${DS_EXPRESSION}" }, "expression": "$C/$D", "hide": false, "refId": "Pod Memory Usage", "type": "math" } ], "title": "Used VRAM", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Average GPU package power", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "displayName": "GPU Power", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 16, "y": 12 }, "id": 7, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": 
[ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_package_power{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_package_power{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_average_package_power{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_average_package_power{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "D" } ], "title": "Average GPU Power", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Average temperature of currently used GPUs", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": 
false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "displayName": "GPU Temperature", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 0, "y": 18 }, "id": 4, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": 
"builder", "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "D", "useBackend": false } ], "title": "Average GPU Temperature", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Average memory temperature of currently used GPUs", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "displayName": "Memory Temperature", "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 8, "y": 18 }, "id": 6, "maxDataPoints": 60, "options": { "legend": { "calcs": 
[], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B", "useBackend": false } ], "title": "Average Memory Temperature", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Averages of current temperatures across GPUs used by the job or pod, in Celsius:\n- 4 HBM temperatures\n- Edge temperature\n- Junction/hotspot temperature", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 8, "x": 
16, "y": 18 }, "id": 8, "maxDataPoints": 60, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "legendFormat": "HBM Temperature", "range": true, "refId": "A", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "Edge Temperature", "range": true, "refId": "B", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "Junction Temperature", "range": true, "refId": "C", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "HBM Temperature", "range": true, "refId": "D", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, 
"includeNullMetadata": true, "instant": false, "legendFormat": "Edge Temperature", "range": true, "refId": "E", "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "builder", "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{pod!=\"\", pod=\"$g_pod\"})", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "Junction Temperature", "range": true, "refId": "F", "useBackend": false } ], "title": "Average Sensor Temperatures", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "List of all GPUs used by this job. Health is the last known status of the GPU during a running job in the selected time range.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "filterable": true, "inspect": false }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "HOSTNAME" }, "properties": [ { "id": "links", "value": [ { "targetBlank": true, "title": "Go to Compute Node Dashboard", "url": "/d/de1q9vq97fe2oc/compute-node?var-g_hostname=${__data.fields.HOSTNAME}" } ] } ] }, { "matcher": { "id": "byName", "options": "GPU ID" }, "properties": [ { "id": "custom.width", "value": 100 }, { "id": "links", "value": [ { "targetBlank": true, "title": "Go to GPU Dashboard", "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" } ] } ] }, { "matcher": { "id": "byName", "options": "GPU UUID" }, "properties": [ { "id": "custom.width", "value": 320 }, { "id": "links", "value": [ { "targetBlank": true, "title": "Go to GPU Dashboard", "url": 
"/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" } ] } ] }, { "matcher": { "id": "byName", "options": "HEALTH" }, "properties": [ { "id": "custom.width", "value": 105 }, { "id": "mappings", "value": [ { "options": { "0": { "color": "red", "index": 0, "text": "unhealthy" }, "1": { "color": "green", "index": 1, "text": "healthy" } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "id": 19, "options": { "cellHeight": "sm", "footer": { "countRows": false, "enablePagination": true, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "HEALTH" }, { "desc": false, "displayName": "HOSTNAME" }, { "desc": false, "displayName": "GPU ID" } ] }, "pluginVersion": "11.2.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"} or vector(0)", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"} or vector(0)", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "B" } ], "title": "All GPUs", "transformations": [ { "id": "reduce", "options": { "labelsToFields": true, "reducers": [ "last" ] } }, { "id": "groupBy", "options": { "fields": { "Last": { "aggregations": [ "last" ], "operation": "aggregate" }, "gpu_id": { "aggregations": [ "last" ], "operation": "aggregate" }, "gpu_uuid": { "aggregations": [], "operation": "groupby" }, "hostname": { "aggregations": [ "last" ], "operation": "aggregate" } } } }, { "disabled": true, "id": "filterFieldsByName", "options": {} }, { "id": "organize", 
"options": { "excludeByName": {}, "includeByName": {}, "indexByName": { "gpu_id (last)": 1, "gpu_uuid": 2, "hostname (last)": 0 }, "renameByName": { "Last (last)": "HEALTH", "gpu_id (last)": "GPU ID", "gpu_uuid": "GPU UUID", "hostname (last)": "HOSTNAME" } } }, { "id": "filterByValue", "options": { "filters": [ { "config": { "id": "isNull", "options": {} }, "fieldName": "GPU UUID" } ], "match": "any", "type": "exclude" } } ], "type": "table" }, { "gridPos": { "h": 2, "w": 2, "x": 22, "y": 24 }, "id": 20, "options": { "code": { "language": "plaintext", "showLineNumbers": false, "showMiniMap": false }, "content": "v1.3.1", "mode": "markdown" }, "pluginVersion": "11.2.2", "title": "Version", "type": "text" } ], "schemaVersion": 39, "tags": [], "templating": { "list": [ { "current": { "selected": false, "text": "", "value": "" }, "description": "string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity)", "hide": 0, "label": "Metrics Prefix", "name": "g_metrics_prefix", "options": [ { "selected": true, "text": "", "value": "" } ], "query": "", "skipUrlSync": false, "type": "textbox" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(cluster_name)", "hide": 0, "includeAll": false, "label": "Cluster", "multi": false, "name": "g_cluster_name", "options": [], "query": { "qryType": 1, "query": "label_values(cluster_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 7, "type": "query" }, { "allValue": "+", "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values({cluster_name=\"$g_cluster_name\"},job_id)", "hide": 0, "includeAll": false, "label": "Job", "multi": false, "name": "g_job_id", "options": [], "query": { "qryType": 1, "query": "label_values({cluster_name=\"$g_cluster_name\"},job_id)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, 
"refresh": 2, "regex": "", "skipUrlSync": false, "sort": 7, "type": "query" }, { "allValue": "+", "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values({cluster_name=\"$g_cluster_name\"},pod)", "hide": 0, "includeAll": false, "label": "Pod", "multi": false, "name": "g_pod", "options": [], "query": { "qryType": 1, "query": "label_values({cluster_name=\"$g_cluster_name\"},pod)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 7, "type": "query" } ] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "Job", "uid": "ce1x81pyv3dvkb", "version": 1, "weekStart": "" }