apiVersion: v1 kind: Service metadata: name: nccl-host-1 spec: selector: name: nccl-host-1 clusterIP: None --- apiVersion: v1 kind: Service metadata: name: nccl-host-2 spec: selector: name: nccl-host-2 clusterIP: None --- apiVersion: v1 kind: Pod metadata: name: nccl-test-host-1 labels: name: nccl-host-1 spec: hostname: host1 subdomain: nccl-host-1 hostNetwork: true dnsPolicy: ClusterFirstWithHostNet containers: - name: tcpxo-daemon image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.8 imagePullPolicy: Always command: ["/bin/sh", "-c"] args: - | set -ex chmod 755 /fts/entrypoint_rxdm_container.sh /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr securityContext: privileged: true volumeMounts: - name: nvidia-install-dir-host mountPath: /usr/local/nvidia env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 - name: nccl-test image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.2 imagePullPolicy: Always command: - /bin/sh - -c - | cat >/scripts/allgather.sh </scripts/allgather.sh <