# Monitors data from NVIDIA's DCGM exporter.
# Depending on your configuration, you might have to modify labels if you have
# customized them from the defaults.
---
apiVersion: metrics.turbonomic.io/v1alpha1
kind: PrometheusQueryMapping
metadata:
  name: nvidia-dcgm-exporter
  labels:
    mapping: nvidia-dcgm-exporter
spec:
  entities:
    - type: nvidiaGPU
      # Each attribute maps one or more Prometheus labels onto a named
      # attribute of the nvidiaGPU entity.
      attributes:
        - label: exported_container
          name: container
        # "label" is kept in addition to "labels" for backward compatibility.
        # NOTE(review): the key is spelled "delimeter" here and is preserved
        # as-is; confirm the spelling against the CRD schema.
        - label: modelName
          labels:
            - modelName
            - GPU_I_PROFILE
          delimeter: "-MIG-"
          name: gpuModel
        - label: gpu
          name: gpuNum
        # "label" is kept in addition to "labels" for backward compatibility.
        - isIdentifier: true
          label: UUID
          labels:
            - UUID
            - GPU_I_ID
          name: id
        - label: exported_namespace
          name: namespace
        # Captures an IPv4 address, optionally followed by ":port", from the
        # "instance" label. The named group must match the "$hostIP"
        # reference in "as".
        - as: $hostIP
          label: instance
          matches: '(?P<hostIP>\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})??)'
          name: nodeIp
        - label: Hostname
          name: nodeName
        - label: exported_pod
          name: pod
        # Strips the trailing hash/suffix segments from the pod name.
        # NOTE(review): the pattern uses unnamed capture groups although "as"
        # references "$deployment"; confirm how the consumer resolves this.
        - as: $deployment
          label: exported_pod
          matches: '^(.*)-[a-f0-9]{10}-[a-z0-9]{5}$|(.*)-[a-f0-9]{0,10}[a-z0-9]{5}$'
          name: deploymentName
      metrics:
        - type: gpu
          queries:
            # This metric is equivalent to "DCGM_FI_DEV_GPU_UTIL", which
            # however doesn't work for MIG, while
            # "DCGM_FI_PROF_GR_ENGINE_ACTIVE" works for both MIG and whole GPU.
            # If for some reason "DCGM_FI_PROF_GR_ENGINE_ACTIVE" is not
            # available and "DCGM_FI_DEV_GPU_UTIL" is available, then replace
            # "DCGM_FI_PROF_GR_ENGINE_ACTIVE{}[DURATION]" with
            # "DCGM_FI_DEV_GPU_UTIL{}[DURATION]/100", as the two metrics are
            # off by a factor of 100.
            - type: used
              promql: avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{}[10m])
        - type: gpuMem
          queries:
            # This query expression exhibits the "X OR Y - Z" pattern because,
            # out of the "total", the "used" and the "free", only two are
            # available, and we don't know which two. Therefore, if the "used"
            # isn't available, we fall back to "total" - "free".
            - type: used
              promql: (avg_over_time(DCGM_FI_DEV_FB_USED[10m]) OR avg_over_time(DCGM_FI_DEV_FB_TOTAL[10m]) - avg_over_time(DCGM_FI_DEV_FB_FREE[10m])) / 1024
            # Capacity should be a constant, so there's no need to compute it
            # using moving averages.
            - type: capacity
              promql: (DCGM_FI_DEV_FB_TOTAL OR DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE) / 1024