# Copyright 2024 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. apiVersion: v1 kind: Service metadata: name: nccl-host-1 spec: selector: name: nccl-host-1 clusterIP: None --- apiVersion: v1 kind: Service metadata: name: nccl-host-2 spec: selector: name: nccl-host-2 clusterIP: None --- apiVersion: v1 kind: Pod metadata: name: nccl-test-host-1 labels: name: nccl-host-1 annotations: devices.gke.io/container.tcpxo-daemon: |+ - path: /dev/nvidia0 - path: /dev/nvidia1 - path: /dev/nvidia2 - path: /dev/nvidia3 - path: /dev/nvidia4 - path: /dev/nvidia5 - path: /dev/nvidia6 - path: /dev/nvidia7 - path: /dev/nvidiactl - path: /dev/nvidia-uvm - path: /dev/dmabuf_import_helper networking.gke.io/default-interface: 'eth0' networking.gke.io/interfaces: | [ {"interfaceName":"eth0","network":"default"}, {"interfaceName":"eth1","network":"vpc1"}, {"interfaceName":"eth2","network":"vpc2"}, {"interfaceName":"eth3","network":"vpc3"}, {"interfaceName":"eth4","network":"vpc4"}, {"interfaceName":"eth5","network":"vpc5"}, {"interfaceName":"eth6","network":"vpc6"}, {"interfaceName":"eth7","network":"vpc7"}, {"interfaceName":"eth8","network":"vpc8"} ] spec: hostname: host1 subdomain: nccl-host-1 # hostNetwork: true # dnsPolicy: ClusterFirstWithHostNet containers: - name: tcpxo-daemon image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9 imagePullPolicy: Always command: ["/bin/sh", "-c"] args: - | set -ex chmod 755 /fts/entrypoint_rxdm_container.sh /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr securityContext: # privileged: true capabilities: add: - CAP_NET_ADMIN - CAP_NET_BIND_SERVICE volumeMounts: - name: nvidia mountPath: /usr/local/nvidia/lib64 - name: sys mountPath: /hostsysfs - name: proc-sys mountPath: /hostprocsysfs env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 - name: nccl-test image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.3 imagePullPolicy: Always # securityContext: # privileged: true command: - /bin/sh - -c - | set -ex chmod 755 /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh cat >/scripts/allgather.sh </scripts/allgather.sh <