# Default values for gpu-operator.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

platform:
  openshift: false

nfd:
  enabled: true

psp:
  enabled: false

sandboxWorkloads:
  enabled: false
  defaultWorkload: "container"

daemonsets:
  priorityClassName: system-node-critical
  tolerations:
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
  # configuration for controlling update strategy ("OnDelete" or "RollingUpdate") of GPU Operands
  updateStrategy: "RollingUpdate"
  # configuration for controlling rolling update of GPU Operands
  rollingUpdate:
    # maximum number of nodes to simultaneously apply pod updates on.
    # can be specified either as a number or a percentage of nodes. Default: 1.
    maxUnavailable: "1"
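
# As noted above, maxUnavailable also accepts a percentage. A minimal sketch
# of an override (the value is illustrative, not a recommendation):
#
#   daemonsets:
#     rollingUpdate:
#       maxUnavailable: "25%"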

validator:
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # If version is not specified, then default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []
  resources: {}
  plugin:
    env:
      - name: WITH_WORKLOAD
        value: "true"

operator:
  repository: nvcr.io/nvidia
  image: gpu-operator
  # If version is not specified, then default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  priorityClassName: system-node-critical
  defaultRuntime: docker
  runtimeClass: nvidia
  use_ocp_driver_toolkit: false
  # cleanup CRD on chart un-install
  cleanupCRD: false
  # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
  # to be passed during helm upgrade.
  upgradeCRD: false
  initContainer:
    image: cuda
    repository: nvcr.io/nvidia
    version: 11.8.0-base-ubi8
    imagePullPolicy: IfNotPresent
  tolerations:
  - key: "node-role.kubernetes.io/master"
    operator: "Equal"
    value: ""
    effect: "NoSchedule"
  annotations:
    openshift.io/scc: restricted-readonly
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 1
        preference:
          matchExpressions:
          - key: "node-role.kubernetes.io/master"
            operator: In
            values: [""]
  logging:
    # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano')
    timeEncoding: epoch
    # Zap Level to configure the verbosity of logging.
    # Can be one of 'debug', 'info', 'error', or any integer value > 0,
    # which corresponds to custom debug levels of increasing verbosity.
    level: info
    # Development Mode defaults (encoder=consoleEncoder, logLevel=Debug, stackTraceLevel=Warn)
    # Production Mode defaults (encoder=jsonEncoder, logLevel=Info, stackTraceLevel=Error)
    develMode: false
  resources:
    limits:
      cpu: 500m
      memory: 350Mi
    requests:
      cpu: 200m
      memory: 100Mi

mig:
  strategy: single

driver:
  enabled: true
  repository: nvcr.io/nvidia
  image: driver
  version: "525.60.13"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  rdma:
    enabled: false
    useHostMofed: false
  manager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.5.1
    imagePullPolicy: IfNotPresent
    env:
    - name: ENABLE_GPU_POD_EVICTION
      value: "true"
    - name: ENABLE_AUTO_DRAIN
      value: "true"
    - name: DRAIN_USE_FORCE
      value: "false"
    - name: DRAIN_POD_SELECTOR_LABEL
      value: ""
    - name: DRAIN_TIMEOUT_SECONDS
      value: "0s"
    - name: DRAIN_DELETE_EMPTYDIR_DATA
      value: "false"
  env: []
  resources: {}
  # Private mirror repository configuration
  repoConfig:
    configMapName: ""
  # custom ssl key/certificate configuration
  certConfig:
    name: ""
  # vGPU licensing configuration
  licensingConfig:
    configMapName: ""
    nlsEnabled: false
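  # A minimal sketch of wiring the above to a pre-created ConfigMap (the
  # ConfigMap name and the kubectl invocation are illustrative; the gridd.conf
  # and NLS client token files come from your NVIDIA vGPU licensing setup):
  #
  #   licensingConfig:
  #     configMapName: licensing-config
  #     nlsEnabled: true
  #
  #   kubectl create configmap licensing-config -n gpu-operator \
  #     --from-file=gridd.conf --from-file=client_configuration_token.tok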
  # vGPU topology daemon configuration
  virtualTopology:
    config: ""
  # kernel module configuration for NVIDIA driver
  kernelModuleConfig:
    name: ""

toolkit:
  enabled: true
  repository: nvcr.io/nvidia/k8s
  image: container-toolkit
  version: v1.11.0-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  installDir: "/usr/local/nvidia"

devicePlugin:
  enabled: true
  repository: nvcr.io/nvidia
  image: k8s-device-plugin
  version: v0.13.0-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env:
    - name: PASS_DEVICE_SPECS
      value: "true"
    - name: FAIL_ON_INIT_ERROR
      value: "true"
    - name: DEVICE_LIST_STRATEGY
      value: envvar
    - name: DEVICE_ID_STRATEGY
      value: uuid
    - name: NVIDIA_VISIBLE_DEVICES
      value: all
    - name: NVIDIA_DRIVER_CAPABILITIES
      value: all
  resources: {}
  # Plugin configuration
  # Use "name" to either point to an existing ConfigMap or to create a new one
  # with a list of configurations (i.e. with create=true).
  # Use "data" to build an integrated ConfigMap from a set of configurations as
  # part of this helm chart. An example of setting "data" might be:
  # config:
  #   name: device-plugin-config
  #   create: true
  #   data:
  #     default: |-
  #       version: v1
  #       flags:
  #         migStrategy: none
  #     mig-single: |-
  #       version: v1
  #       flags:
  #         migStrategy: single
  #     mig-mixed: |-
  #       version: v1
  #       flags:
  #         migStrategy: mixed
  config:
    # Create a ConfigMap (default: false)
    create: false
    # ConfigMap name (either existing or to create a new one with create=true above)
    name: ""
    # Default config name within the ConfigMap
    default: ""
    # Data section for the ConfigMap to create (i.e. only applies when create=true)
    data: {}

# standalone dcgm hostengine
dcgm:
  # disabled by default to use embedded nv-hostengine by exporter
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: dcgm
  version: 3.1.3-1-ubuntu20.04
  imagePullPolicy: IfNotPresent
  hostPort: 5555
  args: []
  env: []
  resources: {}

dcgmExporter:
  enabled: true
  repository: nvcr.io/nvidia/k8s
  image: dcgm-exporter
  version: 3.1.3-3.1.2-ubuntu20.04
  imagePullPolicy: IfNotPresent
  env:
  - name: DCGM_EXPORTER_LISTEN
    value: ":9400"
  - name: DCGM_EXPORTER_KUBERNETES
    value: "true"
  - name: DCGM_EXPORTER_COLLECTORS
    value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
  resources: {}
  serviceMonitor:
    enabled: false
    interval: 15s
    honorLabels: false
    additionalLabels: {}
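
# A minimal sketch of enabling the ServiceMonitor above for a Prometheus
# Operator deployment (the "release: prometheus" label is illustrative and
# must match your Prometheus instance's serviceMonitorSelector):
#
#   dcgmExporter:
#     serviceMonitor:
#       enabled: true
#       interval: 15s
#       additionalLabels:
#         release: prometheus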

gfd:
  enabled: true
  repository: nvcr.io/nvidia
  image: gpu-feature-discovery
  version: v0.7.0-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
  - name: GFD_SLEEP_INTERVAL
    value: 60s
  - name: GFD_FAIL_ON_INIT_ERROR
    value: "true"
  resources: {}

migManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: k8s-mig-manager
  version: v0.5.0-ubuntu20.04
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env:
  - name: WITH_REBOOT
    value: "false"
  resources: {}
  config:
    name: ""
  gpuClientsConfig:
    name: ""

nodeStatusExporter:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: gpu-operator-validator
  # If version is not specified, then default is to use chart.AppVersion
  #version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  resources: {}

gds:
  enabled: false
  repository: nvcr.io/nvidia/cloud-native
  image: nvidia-fs
  version: "2.14.13"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  args: []

vgpuManager:
  enabled: false
  repository: ""
  image: vgpu-manager
  version: ""
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.5.1
    imagePullPolicy: IfNotPresent
    env:
    - name: ENABLE_AUTO_DRAIN
      value: "false"

vgpuDeviceManager:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: vgpu-device-manager
  version: "v0.2.0"
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  config:
    name: ""
    default: "default"

vfioManager:
  enabled: true
  repository: nvcr.io/nvidia
  image: cuda
  version: 11.7.1-base-ubi8
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  env: []
  resources: {}
  driverManager:
    image: k8s-driver-manager
    repository: nvcr.io/nvidia/cloud-native
    version: v0.5.1
    imagePullPolicy: IfNotPresent
    env:
    - name: ENABLE_AUTO_DRAIN
      value: "false"

sandboxDevicePlugin:
  enabled: true
  repository: nvcr.io/nvidia
  image: kubevirt-gpu-device-plugin
  version: v1.2.1
  imagePullPolicy: IfNotPresent
  imagePullSecrets: []
  args: []
  env: []
  resources: {}

node-feature-discovery:
  worker:
    tolerations:
    - key: "node-role.kubernetes.io/master"
      operator: "Equal"
      value: ""
      effect: "NoSchedule"
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    config:
      sources:
        pci:
          deviceClassWhitelist:
          - "02"
          - "0200"
          - "0207"
          - "0300"
          - "0302"
          deviceLabelFields:
          - vendor
  master:
    extraLabelNs:
    - nvidia.com
    serviceAccount:
      name: node-feature-discovery
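
# A minimal sketch of a per-cluster overrides file layered on top of these
# defaults (values are illustrative; disabling the driver container is the
# usual pattern when the NVIDIA driver is preinstalled on the node image):
#
#   driver:
#     enabled: false
#   toolkit:
#     enabled: true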