{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__elements": {}, "__requires": [ { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "11.1.3" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "stat", "name": "Stat", "version": "" }, { "type": "panel", "id": "state-timeline", "name": "State timeline", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" }, { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 0, "y": 0 }, "id": 1, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_account_balance", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Account Balance", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 2, "y": 0 }, "id": 42, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_current_total", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Pending Payout", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 50 }, { "color": "yellow", "value": 80 }, { "color": "blue", "value": 90 }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 4, "y": 0 }, "id": 8, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vast_machine_Verification)/ sum(vast_machine_hostname) *100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Machines verfied", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 6, "y": 0 }, "id": 9, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "count(vast_machine_hostname)", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Total Machines", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 30 }, { "color": "yellow", "value": 50 }, { "color": "blue", "value": 70 }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 8, "y": 0 }, "id": 40, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_gpu_rented_bid_demand) / sum(vast_machine_gpu_name) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "GPU Used Bid", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 30 }, { "color": "yellow", "value": 50 }, { "color": "blue", "value": 70 }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 10, "y": 0 }, "id": 4, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_gpu_rented_on_demand) / sum(vast_machine_gpu_name) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "GPU Used On-demand", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "This shows how many of the GPUs are detected by the systems vs what each machine has registered on vast. ", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 12, "y": 0 }, "id": 5, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "count(DCGM_FI_DEV_GPU_UTIL{})/sum(vast_machine_gpu_name) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Actual vs Vast registred", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 10, "x": 14, "y": 0 }, "id": 19, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_earn_hour) by (hostname)", "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Current Machine earnings per day", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast for all machines", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.1 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 0, "y": 8 }, "id": 21, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_earn_hour)", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Reported Income per day", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Based on incress of the pending balance over the laste hour", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 0.1 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 2, "y": 8 }, "id": 12, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "deriv(vastai_current_total[1h]) * 3600", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Current Income per hour", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 50 }, { "color": "yellow", "value": 80 }, { "color": "blue", "value": 90 }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 4, "y": 8 }, "id": 6, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vast_machine_Listed)/ sum(vast_machine_hostname) *100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Machines Listed", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 6, "y": 8 }, "id": 3, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vast_machine_gpu_name)", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Toal GPU's", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 30 }, { "color": "yellow", "value": 50 }, { "color": "blue", "value": 70 }, { "color": "green", "value": 100 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 8, "y": 8 }, "id": 22, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_gpu_rented_on_reserved) / sum(vast_machine_gpu_name) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "GPU Used Reserved", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "blue", "value": 20 }, { "color": "yellow", "value": 40 }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 2, "x": 10, "y": 8 }, "id": 7, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_gpu_idle) / sum(vast_machine_gpu_name) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "GPU's Idle", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Avraged over 24h as reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 27, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "always", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "id": 27, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_machine_gpu_rented_on_demand) by (hostname)) / (sum(vast_machine_gpu_name) by (hostname) )* 100", "hide": true, "legendFormat": "{{machine_id}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "avg_over_time(\r\n (\r\n sum(vastai_machine_gpu_rented_on_demand) by (hostname) / \r\n sum(vast_machine_gpu_name) by (hostname)\r\n )[10m:15s]\r\n) * 100\r\n", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" } ], "title": "Machine usage On-demand", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total amount of power used by systems", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "#6ED0E0", "value": 1000 }, { "color": "#EAB839", "value": 2000 }, { "color": "#EF843C", "value": 3000 }, { "color": "dark-red", "value": 4000 } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 9, "w": 4, "x": 0, "y": 16 }, "hideTimeOverride": false, "id": 41, "interval": "15s", "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto" }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(\r\n sum by (job) (\r\n $BasePower + $CPU_Max_Power * \r\n irate(node_cpu_seconds_total{mode=\"system\"}[$__rate_interval])\r\n ) / sum by (job) (\r\n (irate(node_cpu_seconds_total{}[$__rate_interval]))\r\n ) \r\n) + sum(\r\n sum by (job) (DCGM_FI_DEV_POWER_USAGE)\r\n) \r\n", "format": "table", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Total Power", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": true, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 9, "w": 20, "x": 4, "y": 16 }, "id": 26, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum by (job) (irate(node_cpu_seconds_total{mode=\"system\"}[$__rate_interval])) / on(job) group_left sum by (job)((irate(node_cpu_seconds_total{}[$__rate_interval]))) * $CPU_Max_Power * count by (job)(node_cpu_temperature{package=~\"[0-9]+\"}) + sum by (job) (DCGM_FI_DEV_POWER_USAGE) + $BasePower", "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Machines Power usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false, "minWidth": 50, "width": 65 }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Stored Jobs" }, "properties": [ { "id": "custom.width", "value": 90 } ] }, { "matcher": { "id": "byName", "options": "GPU's" }, "properties": [ { "id": "custom.width", "value": 57 } ] }, { "matcher": { "id": "byName", "options": "hostname" }, "properties": [ { "id": "custom.width", "value": 85 } ] }, { "matcher": { "id": "byName", "options": "Idle" }, "properties": [ { "id": "custom.width", "value": 50 } ] }, { "matcher": { "id": "byName", "options": "Run I" }, "properties": [ { "id": "custom.width", "value": 50 } ] }, { "matcher": { "id": "byName", "options": "Run D" }, "properties": [ { "id": "custom.width", "value": 55 } ] }, { "matcher": { "id": "byName", "options": "gpu_name" }, "properties": [ { "id": "custom.width", "value": 102 } ] }, { "matcher": { "id": "byName", "options": "Machine" }, "properties": [ { "id": "custom.width", "value": 80 } ] }, { "matcher": { "id": "byName", "options": "Listed" }, "properties": [ { "id": "custom.width", "value": 65 } ] }, { "matcher": { "id": "byName", "options": "Avail Space(GB)" }, "properties": [ { "id": "custom.width", "value": 125 } ] }, { "matcher": { "id": "byName", "options": "Value #H" }, "properties": [ { "id": "custom.width", "value": 60 } ] }, { "matcher": { "id": "byName", "options": "Value #I" }, "properties": [ { "id": "custom.width", "value": 65 } ] }, { "matcher": { "id": "byName", "options": "machine_id" }, "properties": [ { "id": "custom.width", "value": 97 } ] }, { "matcher": { "id": "byName", "options": "__name__" }, "properties": [ { "id": "custom.width", "value": 211 } ] }, { "matcher": { "id": "byName", "options": "End Date" }, "properties": [ { "id": "custom.width", "value": 159 } ] }, { "matcher": { "id": "byName", "options": "End Date" }, "properties": [ { "id": "custom.width", "value": 150 }, { "id": "unit", "value": "dateTimeAsLocal" } ] }, { "matcher": { "id": "byName", "options": "End Date * 1" }, "properties": [ { "id": "custom.width", "value": 150 } ] } ] }, "gridPos": { "h": 8, "w": 13, "x": 0, "y": 25 }, "id": 11, "options": { "cellHeight": "sm", "footer": { "countRows": false, "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "hostname" } ] }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vast_machine_gpu_name", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vast_machine_hostname", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_current_rentals_resident", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_idle", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_rented_bid_demand", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_rented_on_reserved", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "K" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_rented_on_demand", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_avail_disk_space", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vast_machine_Listed", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vast_machine_Verification", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "I" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "APT_UPGRADABLE_PACKAGES", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "J" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "exemplar": false, "expr": "vastai_machine_end_date", "format": "table", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "L" } ], "timeFrom": "5s", "title": "Machine_Info", "transformations": [ { "id": "joinByField", "options": { "byField": "hostname", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 10": false, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "Time 9": true, "Value #A": false, "Value #B": true, "__name__ 1": true, "__name__ 10": true, "__name__ 11": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "__name__ 9": true, "hostname 2": true, "hostname 3": true, "hostname 4": true, "hostname 5": true, "hostname 6": true, "hostname 7": true, "hostname 8": true, "hostname 9": true, "instance": true, "instance 1": true, "instance 10": true, "instance 11": false, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "instance 9": true, "job": true, "job 1": true, "job 10": true, "job 11": false, "job 2": true, "job 3": true, "job 4": true, "job 5": true, "job 6": true, "job 7": true, "job 8": true, "job 9": true, "machine_id 10": true, "machine_id 11": true, "machine_id 2": true, "machine_id 3": true, "machine_id 4": true, "machine_id 5": true, "machine_id 6": true, "machine_id 7": true, "machine_id 8": true, "machine_id 9": true }, "includeByName": {}, "indexByName": { "Time 1": 46, "Time 10": 57, "Time 2": 4, "Time 3": 9, "Time 4": 14, "Time 5": 19, "Time 6": 24, "Time 7": 30, "Time 8": 34, "Time 9": 39, "Value #A": 45, "Value #B": 8, "Value #C": 13, "Value #D": 18, "Value #E": 23, "Value #F": 29, "Value #G": 44, "Value #H": 38, "Value #I": 43, "Value #K": 28, "__name__ 1": 3, "__name__ 10": 58, "__name__ 2": 5, "__name__ 3": 10, "__name__ 4": 15, "__name__ 5": 20, "__name__ 6": 25, "__name__ 7": 31, "__name__ 8": 35, "__name__ 9": 40, "gpu_name": 2, "hostname": 0, "instance 1": 47, "instance 10": 59, "instance 2": 6, "instance 3": 11, "instance 4": 16, "instance 5": 21, "instance 6": 26, "instance 7": 32, "instance 8": 36, "instance 9": 41, "job 1": 48, "job 10": 60, "job 2": 7, "job 3": 12, "job 4": 17, "job 5": 22, "job 6": 27, "job 7": 33, "job 8": 37, "job 9": 42, "machine_id 1": 1, "machine_id 10": 61, "machine_id 2": 49, "machine_id 3": 50, "machine_id 4": 51, "machine_id 5": 52, "machine_id 6": 53, "machine_id 7": 54, "machine_id 8": 55, "machine_id 9": 56 }, "renameByName": { "Time 10": "", "Value #A": "GPU's", "Value #C": "Stored Jobs", "Value #D": "Idle", "Value #E": "Run I", "Value #F": "Run D", "Value #G": "Avail Space(GB)", "Value #H": "Listed", "Value #I": "Verifyd", "Value #K": "Run R", "Value #L": "End Date", "instance 6": "", "machine_id": "Machine", "machine_id 11": "" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false, "minWidth": 50 }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "GPU Count" }, "properties": [ { "id": "custom.width", "value": 78 } ] }, { "matcher": { "id": "byName", "options": "GPU Model Name" }, "properties": [ { "id": "custom.width", "value": 202 } ] }, { "matcher": { "id": "byName", "options": "Hostname" }, "properties": [ { "id": "custom.width", "value": 103 } ] } ] }, "gridPos": { "h": 8, "w": 8, "x": 13, "y": 25 }, "id": 29, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [] }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "count(DCGM_FI_DEV_FB_USED) by (Hostname)", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_SM_CLOCK{Hostname=~\".+\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" } ], "timeFrom": "30s", "title": "Reported GPU Count", "transformations": [ { "id": "joinByField", "options": { "byField": "Hostname", "mode": "outer" } }, { "id": "groupBy", "options": { "fields": { "DCGM_FI_DRIVER_VERSION": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Hostname": { "aggregations": [], "operation": "groupby" }, "Value": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #A": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "Value #B": { "aggregations": [ "lastNotNull" ] }, "modelName": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" } } } }, { "id": "organize", "options": { "excludeByName": {}, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION (lastNotNull)": 3, "Hostname": 0, "Value #A (lastNotNull)": 1, "modelName (lastNotNull)": 2 }, "renameByName": { "DCGM_FI_DRIVER_VERSION (lastNotNull)": "DCGM_FI_DRIVER_VERSION", "Hostname": "", "Value #A (lastNotNull)": "GPU Count", "Value (lastNotNull)": "GPU COUNT", "modelName (lastNotNull)": "GPU Model Name" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false, "minWidth": 50 }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Hostname" }, "properties": [ { "id": "custom.width", "value": 105 } ] } ] }, "gridPos": { "h": 8, "w": 3, "x": 21, "y": 25 }, "id": 38, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": [ "Value" ], "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [] }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "APT_UPGRADABLE_PACKAGES", "format": "table", "instant": false, "legendFormat": "__auto", "range": true, "refId": "A" } ], "title": "Pending Updates", "transformations": [ { "id": "groupBy", "options": { "fields": { "Value": { "aggregations": [ "lastNotNull" ], "operation": "aggregate" }, "job": { "aggregations": [], "operation": "groupby" } } } }, { "id": "organize", "options": { "excludeByName": {}, "includeByName": {}, "indexByName": {}, "renameByName": { "Value (lastNotNull)": "Number", "job": "Hostname" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false, "minWidth": 50 }, "decimals": 1, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 33 }, "id": 39, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 4, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum by(instance) (irate(node_cpu_seconds_total{mode!=\"idle\"}[$__rate_interval])) / on(instance) group_left sum by (instance) (irate(node_cpu_seconds_total[$__rate_interval]))) * 100\r\n", "format": "table", "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "node_uname_info", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum by(instance) (node_memory_MemTotal_bytes{} - node_memory_MemFree_bytes{} - node_memory_Buffers_bytes{} - node_memory_Cached_bytes{})) / sum by(instance) (node_memory_MemTotal_bytes{}) * 100\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(node_time_seconds - on(instance) node_boot_time_seconds ) / 60 /60", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "round(max by(instance)(rate(node_network_transmit_bytes_total[1m])*8/1000000), 0.1)\r\n\r\n\r\n\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "round(max by(instance)(rate(node_network_receive_bytes_total[1m])*8/1000000), 0.1)\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(1 - avg by(instance)(node_filesystem_avail_bytes{mountpoint=\"/\"}/node_filesystem_size_bytes{mountpoint=\"/\"})) * 100\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(1 - avg by(instance)(node_filesystem_avail_bytes{mountpoint=\"/var/lib/docker\"}/node_filesystem_size_bytes{mountpoint=\"/var/lib/docker\"})) * 100\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "avg by(instance)(node_cpu_temperature{})", "format": "table", "hide": false, "instant": false, "legendFormat": "__auto", "range": true, "refId": "I" } ], "title": "Machine Overview", "transformations": [ { "id": "joinByField", "options": { "byField": "instance", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "Time 9": true, "Value #B": true, "__name__": true, "domainname": true, "instance": true, "job": true, "machine": true, "sysname": true, "version": true }, "indexByName": { "Time 1": 4, "Time 2": 7, "Time 3": 14, "Time 4": 16, "Time 5": 18, "Time 6": 20, "Time 7": 22, "Time 8": 24, "Time 9": 26, "Value #A": 5, "Value #B": 13, "Value #C": 15, "Value #D": 17, "Value #E": 19, "Value #F": 21, "Value #G": 23, "Value #H": 25, "Value #I": 6, "__name__": 8, "domainname": 9, "instance": 3, "job": 10, "machine": 2, "nodename": 0, "release": 1, "sysname": 11, "version": 12 }, "renameByName": { "Time 2": "", "Time 5": "", "Value #A": "CPU %", "Value #C": "Memory Used %", "Value #D": "Uptime(h)", "Value #E": "Network_transmit Mbps", "Value #F": "Network_receive Mbps ", "Value #G": "Root FS Disk Usage %", "Value #H": "Docker Disk Usage %", "Value #I": "CPU Temp C", "nodename": "Machine", "release": "Kernel" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "transparent", "index": 0, "text": "Idle" }, "to": 0 }, "type": "range" }, { "options": { "from": 1, "result": { "color": "blue", "index": 1, "text": "I" }, "to": 1 }, "type": "range" }, { "options": { "from": 2, "result": { "color": "green", "index": 2, "text": "D" }, "to": 2 }, "type": "range" }, { "options": { "from": 3, "result": { "color": "orange", "index": 3, "text": "R" }, "to": 3 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "gpu 1" }, "properties": [ { "id": "custom.width", "value": 133 } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, "id": 37, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true, "sortBy": [] }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"4\"} ", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "vastai_machine_gpu_occupancy{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "5s", "title": "Machine GPU Occupancy", "transformations": [ { "id": "joinByField", "options": { "byField": "Hostname", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Value #A": false, "Value #F": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true, "job 5": true, "job 6": true, "machine_id 1": true, "machine_id 2": true, "machine_id 3": true, "machine_id 4": true, "machine_id 5": true, "machine_id 6": true }, "includeByName": {}, "indexByName": {}, "renameByName": { "Time 2": "", "Value #A": "GPU 1", "Value #B": "GPU 2", "Value #C": "GPU 3", "Value #D": "GPU 4", "Value #E": "GPU 5", "Value #F": "GPU 6", "gpu 2": "GPU 2", "gpu 5": "", "instance 6": "" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Hostname" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, "id": 10, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vast_machine_Reliability) by (hostname)", "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Machine Reliability", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "fillOpacity": 70, "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineWidth": 0, "spanNulls": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }, "id": 13, "options": { "alignValue": "left", "legend": { "displayMode": "list", "placement": "bottom", "showLegend": false }, "mergeValues": false, "rowHeight": 0.9, "showValue": "auto", "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "9.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum by(hostname) (vast_machine_timeout)", "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Machine Online Status", "type": "state-timeline" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current error state for GPU (2 for error, 1 for no error, 0 unknown state)", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "transparent", "index": 0, "text": "Unknown" }, "to": 0 }, "type": "range" }, { "options": { "from": 1, "result": { "color": "green", "index": 1, "text": "OK" }, "to": 1 }, "type": "range" }, { "options": { "from": 2, "result": { "color": "red", "index": 2, "text": "ERROR" }, "to": 2 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }, "id": 35, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"4\"} ", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_ERROR_STATE{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU Error State", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true, "reason": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api.", "fieldConfig": { "defaults": { "color": { "fixedColor": "green", "mode": "fixed" }, "custom": { "fillOpacity": 70, "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineWidth": 0, "spanNulls": false }, "mappings": [], "max": 10, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 10 } ] } }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 56 }, "id": 20, "options": { "alignValue": "left", "legend": { "displayMode": "list", "placement": "bottom", "showLegend": false }, "mergeValues": false, "rowHeight": 0.9, "showValue": "auto", "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "9.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum by(hostname) (vastai_machine_ErrorDescription)", "format": "time_series", "legendFormat": "{{hostname}}", "range": true, "refId": "A" } ], "title": "Machine Reporting Error", "type": "state-timeline" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 150 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 56 }, "id": 31, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU VRAM Temps ", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "orange", "index": 3 }, "to": 90 }, "type": "range" }, { "options": { "from": 91, "result": { "color": "red", "index": 4 }, "to": 200 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 }, "id": 32, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"0\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"1\"}[1m])\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"2\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"3\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"4\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"5\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"6\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "last_over_time(DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"7\"}[1m])", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU Core Hot Spot", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "%" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 }, "id": 28, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU FAN Speeds", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "DCGM_FI_DRIVER_VERSION 5": true, "DCGM_FI_DRIVER_VERSION 6": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 72 }, "id": 25, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU Temps", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "DCGM_FI_DRIVER_VERSION 5": true, "DCGM_FI_DRIVER_VERSION 6": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 0 }, "type": "range" }, { "options": { "from": 1, "result": { "color": "red", "index": 1 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "bool" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 72 }, "id": 33, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"4\"} ", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU Thermal Throttle", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true, "reason": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 100 }, "type": "range" }, { "options": { "from": 100, "result": { "color": "green", "index": 1 }, "to": 200 }, "type": "range" }, { "options": { "from": 200, "result": { "color": "yellow", "index": 2 }, "to": 300 }, "type": "range" }, { "options": { "from": 300, "result": { "color": "red", "index": 3 }, "to": 500 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 80 }, "id": 34, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU Power", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "DCGM_FI_DRIVER_VERSION 5": true, "DCGM_FI_DRIVER_VERSION 6": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Cont of AER in system logs per GPU: Advanced Error Reporting by PCI Express devices", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 1 }, "type": "range" }, { "options": { "from": 1, "result": { "color": "green", "index": 1 }, "to": 10 }, "type": "range" }, { "options": { "from": 10, "result": { "color": "yellow", "index": 2 }, "to": 100 }, "type": "range" }, { "options": { "from": 100, "result": { "color": "red", "index": 3 }, "to": 100000000000 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 80 }, "id": 36, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "11.1.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORS{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "GPU_AER_TOTAL_ERRORSE{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "timeFrom": "30s", "title": "Machine GPU AER Errors", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api for the day before. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 88 }, "id": 15, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_per_machine_gpu_earn) by (machine_id) ) !=0", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Historic Machine earnings for GPUs per day", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api for the day before. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 88 }, "id": 14, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_per_machine_gpu_earn+vastai_per_machine_bwd_earn+vastai_per_machine_bwu_earn+vastai_per_machine_sto_earn) by (machine_id) ) !=0", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Historic Machine earnings per day", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "decmbytes" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 96 }, "id": 18, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_machine_avail_disk_space) by (machine_id) ) !=0", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Machine avail disk space reported by vast", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api for the day before. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 96 }, "id": 16, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_per_machine_sto_earn) by (machine_id) ) !=0", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Historic Machine earnings storage per day", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "As reported by vast api for the day before. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "currencyUSD" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 104 }, "id": 17, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum(vastai_per_machine_bwd_earn+vastai_per_machine_bwu_earn) by (machine_id) ) !=0", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Historic Machine earnings Bandwith per day", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "MBs" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 104 }, "id": 24, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vastai_machine_InetUp) by (machine_id)", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Machine Internet Upload speed", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Reported by vast api. ", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "MBs" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 112 }, "id": 23, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "sum(vast_machine_InetDown) by (machine_id)", "legendFormat": "{{machine_id}}", "range": true, "refId": "A" } ], "title": "Machine Internet Down speed", "transformations": [ { "filter": { "id": "byRefId", "options": "" }, "id": "filterByValue", "options": { "filters": [], "match": "any", "type": "include" } } ], "type": "timeseries" } ], "refresh": "5s", "schemaVersion": 39, "tags": [ "Prometheus" ], "templating": { "list": [ { "current": { "selected": false, "text": "300", "value": "300" }, "hide": 0, "label": "CPU Max Power", "name": "CPU_Max_Power", "options": [ { "selected": true, "text": "300", "value": "300" } ], "query": "300", "skipUrlSync": false, "type": "textbox" }, { "current": { "selected": false, "text": "100", "value": "100" }, "hide": 0, "label": "BasePower", "name": "BasePower", "options": [ { "selected": true, "text": "100", "value": "100" } ], "query": "100", "skipUrlSync": false, "type": "textbox" } ] }, "time": { "from": "now-24h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Vast Dashboard", "uid": "d1e22d5b-dcf4-4f54-8629-972173cb24eb", "version": 139, "weekStart": "" }