apiVersion: v1 kind: Service metadata: name: nccl-host-1 spec: selector: name: nccl-host-1 clusterIP: None --- apiVersion: v1 kind: Service metadata: name: nccl-host-2 spec: selector: name: nccl-host-2 clusterIP: None --- apiVersion: v1 kind: Pod metadata: name: nccl-test-host-1 labels: name: nccl-host-1 tcpxo: daemon annotations: devices.gke.io/container.tcpxo-daemon: |+ - path: /dev/nvidia0 - path: /dev/nvidia1 - path: /dev/nvidia2 - path: /dev/nvidia3 - path: /dev/nvidia4 - path: /dev/nvidia5 - path: /dev/nvidia6 - path: /dev/nvidia7 - path: /dev/nvidiactl - path: /dev/nvidia-uvm - path: /dev/dmabuf_import_helper networking.gke.io/default-interface: 'eth0' networking.gke.io/interfaces: | [ {"interfaceName":"eth0","network":"default"}, {"interfaceName":"eth1","network":"vpc1"}, {"interfaceName":"eth2","network":"vpc2"}, {"interfaceName":"eth3","network":"vpc3"}, {"interfaceName":"eth4","network":"vpc4"}, {"interfaceName":"eth5","network":"vpc5"}, {"interfaceName":"eth6","network":"vpc6"}, {"interfaceName":"eth7","network":"vpc7"}, {"interfaceName":"eth8","network":"vpc8"} ] spec: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - labelSelector: matchExpressions: - key: tcpxo operator: In values: - daemon topologyKey: "kubernetes.io/hostname" nodeSelector: cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb cloud.google.com/gke-spot: "true" cloud.google.com/gke-gpu-driver-version: latest topology.kubernetes.io/zone: us-central1-c hostname: host1 subdomain: nccl-host-1 containers: - name: tcpxo-daemon image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.11 imagePullPolicy: Always command: ["/bin/sh", "-c"] args: - | set -ex chmod 755 /fts/entrypoint_rxdm_container.sh /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr --enforce_kernel_ipv6_support=false securityContext: capabilities: add: - NET_ADMIN - NET_BIND_SERVICE volumeMounts: - name: nvidia mountPath: /usr/local/nvidia/lib64 - name: sys mountPath: /hostsysfs - name: proc-sys mountPath: /hostprocsysfs - name: nccl-test image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.5 imagePullPolicy: Always command: - /bin/sh - -c - | cat >/scripts/allgather.sh </scripts/allgather.sh <