# Default values for gpu-operator.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

platform:
  openshift: false

nfd:
  enabled: true

psp:
  enabled: false

sandboxWorkloads:
  enabled: false
  defaultWorkload: "container"

daemonsets:
  priorityClassName: system-node-critical
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule

validator:
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # If version is not specified, then the default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []
  resources: {}
  plugin:
    env:
      - name: WITH_WORKLOAD
        value: "true"

operator:
  repository: nvcr.io/nvidia
  image: gpu-operator
  # If version is not specified, then the default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  priorityClassName: system-node-critical
  defaultRuntime: docker
  runtimeClass: nvidia
  use_ocp_driver_toolkit: false
  initContainer:
    image: cuda
    repository: nvcr.io/nvidia
    version: 11.6.0-base-ubi8
    imagePullPolicy: IfNotPresent
  tolerations:
    - key: "node-role.kubernetes.io/master"
      operator: "Equal"
      value: ""
      effect: "NoSchedule"
  annotations:
    openshift.io/scc: restricted-readonly
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: "node-role.kubernetes.io/master"
                operator: In
                values: [""]
  logging:
    timeEncoding: epoch
  resources:
    limits:
      cpu: 500m
      memory: 350Mi
    requests:
      cpu: 200m
      memory: 100Mi

mig:
  strategy: single

driver:
  enabled: true
  repository: nvcr.io/nvidia
  image: driver
  version: "515.48.07"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  rdma:
    enabled: false
    useHostMofed: false
  manager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.4.1
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_AUTO_DRAIN
        value: "true"
      - name: DRAIN_USE_FORCE
        value: "false"
      - name: DRAIN_POD_SELECTOR_LABEL
        value: ""
      - name: DRAIN_TIMEOUT_SECONDS
        value: "0s"
      - name: DRAIN_DELETE_EMPTYDIR_DATA
        value: "false"
  env: []
  resources: {}
  # Private mirror repository configuration
  repoConfig:
    configMapName: ""
  # custom SSL key/certificate configuration
  certConfig:
    name: ""
  # vGPU licensing configuration
  licensingConfig:
    configMapName: ""
    nlsEnabled: false
  # vGPU topology daemon configuration
  virtualTopology:
    config: ""
  # kernel module configuration for the NVIDIA driver
  kernelModuleConfig:
    name: ""
  # configuration for controlling the rolling update of NVIDIA driver DaemonSet pods
  rollingUpdate:
    # maximum number of nodes to simultaneously apply pod updates on;
    # can be specified either as a number or as a percentage of nodes. Default 1.
    maxUnavailable: "1"
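    # Illustrative alternative (commented out, not a chart default): since a
    # percentage of nodes is also accepted here, the driver rollout could be
    # capped at, e.g., 10% of GPU nodes at a time:
    #   maxUnavailable: "10%"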
toolkit:
  enabled: true
  repository: nvcr.io/nvidia/k8s
  image: container-toolkit
  version: v1.10.0-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}

devicePlugin:
  repository: nvcr.io/nvidia
  image: k8s-device-plugin
  version: v0.12.2-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env:
    - name: PASS_DEVICE_SPECS
      value: "true"
    - name: FAIL_ON_INIT_ERROR
      value: "true"
    - name: DEVICE_LIST_STRATEGY
      value: envvar
    - name: DEVICE_ID_STRATEGY
      value: uuid
    - name: NVIDIA_VISIBLE_DEVICES
      value: all
    - name: NVIDIA_DRIVER_CAPABILITIES
      value: all
  resources: {}
  # Plugin configuration
  config:
    # ConfigMap name
    name: ""
    # Default config name within the ConfigMap
    default: ""

# standalone dcgm hostengine
dcgm:
  # disabled by default to use the embedded nv-hostengine in the exporter
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: dcgm
  version: 2.4.5-1-ubuntu20.04
  imagePullPolicy: IfNotPresent
  hostPort: 5555
  args: []
  env: []
  resources: {}

dcgmExporter:
  repository: nvcr.io/nvidia/k8s
  image: dcgm-exporter
  version: 2.4.5-2.6.7-ubuntu20.04
  imagePullPolicy: IfNotPresent
  env:
    - name: DCGM_EXPORTER_LISTEN
      value: ":9400"
    - name: DCGM_EXPORTER_KUBERNETES
      value: "true"
    - name: DCGM_EXPORTER_COLLECTORS
      value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
  resources: {}

gfd:
  repository: nvcr.io/nvidia
  image: gpu-feature-discovery
  version: v0.6.1-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
    - name: GFD_SLEEP_INTERVAL
      value: 60s
    - name: GFD_FAIL_ON_INIT_ERROR
      value: "true"
  resources: {}

migManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: k8s-mig-manager
  version: v0.4.2-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
    - name: WITH_REBOOT
      value: "false"
  resources: {}
  config:
    name: ""
  gpuClientsConfig:
    name: ""

nodeStatusExporter:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # If version is not specified, then the default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  resources: {}

# Experimental: only deploys the nvidia-fs driver on Ubuntu
gds:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: nvidia-fs
  version: "515.43.04"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []

vgpuManager:
  enabled: true
  repository: nvcr.io/nvidia
  image: vgpu-manager
  version: "510.73.06"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.4.1
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_AUTO_DRAIN
        value: "false"

vgpuDeviceManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: vgpu-device-manager
  version: "v0.1.0"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  # NOTE: When sandboxWorkloads is enabled, creating a custom ConfigMap with
  # the vGPU device configuration is required for a successful vGPU Device
  # Manager deployment. Replace the values below with the correct ConfigMap
  # name and the default config name within that ConfigMap.
  config:
    name: "vgpu-devices-config"
    default: "default"
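  # Illustrative sketch (not part of the chart defaults): the ConfigMap
  # referenced above could be created with kubectl before enabling
  # sandboxWorkloads, e.g.
  #   kubectl create configmap vgpu-devices-config -n gpu-operator \
  #     --from-file=config.yaml
  # The file/key name "config.yaml" is a placeholder; consult the vGPU Device
  # Manager documentation for the expected configuration format.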
vfioManager:
  enabled: true
  repository: nvcr.io/nvidia
  image: cuda
  version: 11.6.0-base-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.4.1
    imagePullPolicy: IfNotPresent
    env:
      - name: ENABLE_AUTO_DRAIN
        value: "false"

sandboxDevicePlugin:
  enabled: true
  repository: nvcr.io/nvidia
  image: kubevirt-gpu-device-plugin
  version: v1.1.2
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env: []
  resources: {}

node-feature-discovery:
  worker:
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "nvidia.com/gpu"
        operator: "Equal"
        value: "present"
        effect: "NoSchedule"
    config:
      sources:
        pci:
          deviceClassWhitelist:
            - "02"
            - "0200"
            - "0207"
            - "0300"
            - "0302"
          deviceLabelFields:
            - vendor
  master:
    extraLabelNs:
      - nvidia.com
    serviceAccount:
      name: node-feature-discovery
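# Illustrative usage (release, repo alias, and namespace names are examples):
# any value in this file can be overridden at install or upgrade time, e.g.
#   helm upgrade --install gpu-operator nvidia/gpu-operator \
#     --namespace gpu-operator --create-namespace \
#     --set driver.version="515.48.07" \
#     --set mig.strategy=mixed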