---
# DaemonSet "nccl-tcpxo-installer" (namespace kube-system).
#
# Scheduled only on nodes whose cloud.google.com/gke-accelerator label is
# nvidia-h100-mega-80gb. Each pod runs two init containers in order:
#   1. pre-installation     - enters the host's namespaces and prepares the
#                             host (iptables rule, kernel module, bind mounts
#                             under /dev/aperture_devices).
#   2. nccl-tcpxo-installer - runs the image's install script and copies the
#                             resulting libraries into the host directory
#                             /home/kubernetes/bin/nvidia/lib64.
# A pause container then keeps the pod alive after the init containers finish.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-tcpxo-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-tcpxo-installer
spec:
  selector:
    matchLabels:
      k8s-app: nccl-tcpxo-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-tcpxo-installer
        k8s-app: nccl-tcpxo-installer
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-mega-80gb
      # A toleration with only "operator: Exists" (no key) tolerates every
      # taint, so the pod is not kept off tainted nodes.
      tolerations:
        - operator: Exists
      hostNetwork: true
      # hostPID makes PID 1 inside the pod the host's init process, which is
      # what the "nsenter -at 1" call below targets.
      hostPID: true
      volumes:
        - name: var-lib
          hostPath:
            path: /var/lib
        # NOTE(review): no container in this manifest mounts this volume;
        # /var/lib/tcpxo is reached through the var-lib mount instead.
        # Confirm whether it is still needed.
        - name: tcpxo
          hostPath:
            path: /var/lib/tcpxo
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin
      initContainers:
        # Pinned to a release tag instead of the implicit ":latest" so this
        # privileged container is reproducible. Only nsenter is taken from
        # the image; everything after "--" runs in the host's namespaces.
        - image: "ubuntu:22.04"
          name: pre-installation
          securityContext:
            privileged: true
          # Script steps, in order:
          #   * insert an ACCEPT rule for all TCP at the head of the INPUT
          #     chain and, if that succeeds, load the import-helper module.
          #     NOTE(review): "-I" adds another copy of the rule every time
          #     this init container runs.
          #   * for every PCI device whose lspci line matches 1ae0:0084,
          #     bind-mount its resolved /sys/bus/pci/devices/<BDF> directory
          #     onto /dev/aperture_devices/<BDF>. The umount before the mount
          #     clears a previous bind mount; it fails when nothing is
          #     mounted there, which does not stop the script (no "set -e").
          #   * make the mounted trees world-readable and their resource*
          #     files world-readable and -writable.
          # The loop is fed through a pipe rather than "< <(...)" because
          # process substitution is a bash extension and the script is run
          # with "sh -c". No variable set inside the loop is read after it,
          # so running the loop in the pipeline's subshell changes nothing.
          command:
            - nsenter
            - '-at'
            - '1'
            - '--'
            - sh
            - '-c'
            - |
              /sbin/iptables -I INPUT -p tcp -m tcp -j ACCEPT && modprobe import-helper
              sudo mkdir -p /dev/aperture_devices
              lspci -nn -D | grep '1ae0:0084' | while IFS= read -r line; do
                BDF=$( echo "$line" | awk '{print $1}' )
                target_aperture_path="/dev/aperture_devices/$BDF"
                host_aperture_device=$(readlink -f "/sys/bus/pci/devices/$BDF")
                sudo mkdir -p "$target_aperture_path"
                sudo umount -R "$target_aperture_path"
                sudo mount --bind "$host_aperture_device" "$target_aperture_path"
              done
              if [ -d /dev/aperture_devices ]; then
                chmod -R a+r /dev/aperture_devices/
                chmod a+rw /dev/aperture_devices/*/resource*
              fi
        - name: nccl-tcpxo-installer
          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.3
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          volumeMounts:
            - name: var-lib
              mountPath: /var/lib
            # Host /home/kubernetes/bin appears as /usr/local in the
            # container, so /usr/local/nvidia/lib64 below is the host path
            # /home/kubernetes/bin/nvidia/lib64.
            - name: library-dir-host
              mountPath: /usr/local
          command: ["/bin/sh", "-c"]
          # "set -ex" aborts on the first failing command and traces each
          # one. The install script ships in the image; presumably it
          # populates /var/lib/tcpxo/lib64, which is then copied to the
          # host library directory - confirm against the image contents.
          args:
            - |
              set -ex
              chmod 755 /scripts/container_entry.sh
              /scripts/container_entry.sh install --install-nccl
              mkdir -p /usr/local/nvidia/lib64
              cp -r /var/lib/tcpxo/lib64/. /usr/local/nvidia/lib64
              echo "installation finishes"
      containers:
        # Placeholder container: keeps the pod alive once installation is
        # done.
        - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
          name: pause