# Copyright 2024 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
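
# Two-node NCCL all_gather benchmark over GPUDirect-TCPXO on GKE. One headless
# Service and one test Pod are created per node; in each Pod the tcpxo-daemon
# (RxDM) sidecar runs alongside an nccl-test container that wraps the benchmark
# in /scripts/allgather.sh.
#
# Usage sketch (assumes kubectl targets a GKE cluster already provisioned for
# GPUDirect-TCPXO, e.g. a3-megagpu-8g nodes with multi-networking enabled):
#   kubectl apply -f <this file>
#   kubectl exec --stdin --tty --container=nccl-test nccl-test-host-1 -- \
#     /scripts/allgather.sh nccl-host-1 nccl-host-2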

apiVersion: v1
kind: Service
metadata:
  name: nccl-host-1
spec:
  selector:
    name: nccl-host-1
  clusterIP: None
---
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-2
spec:
  selector:
    name: nccl-host-2
  clusterIP: None
---
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-1
  labels:
    name: nccl-host-1
    tcpxo: daemon
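  # The devices annotation asks GKE's device injector to expose these NVIDIA
  # device nodes inside the tcpxo-daemon container.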
  annotations:
    devices.gke.io/container.tcpxo-daemon: |+
      - path: /dev/nvidia0
      - path: /dev/nvidia1
      - path: /dev/nvidia2
      - path: /dev/nvidia3
      - path: /dev/nvidia4
      - path: /dev/nvidia5
      - path: /dev/nvidia6
      - path: /dev/nvidia7
      - path: /dev/nvidiactl
      - path: /dev/nvidia-uvm
      - path: /dev/dmabuf_import_helper
    networking.gke.io/default-interface: 'eth0'
    networking.gke.io/interfaces: |
      [
        {"interfaceName":"eth0","network":"default"},
        {"interfaceName":"eth1","network":"vpc1"},
        {"interfaceName":"eth2","network":"vpc2"},
        {"interfaceName":"eth3","network":"vpc3"},
        {"interfaceName":"eth4","network":"vpc4"},
        {"interfaceName":"eth5","network":"vpc5"},
        {"interfaceName":"eth6","network":"vpc6"},
        {"interfaceName":"eth7","network":"vpc7"},
        {"interfaceName":"eth8","network":"vpc8"}
      ]
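  # eth0 stays on the default Pod network; eth1-eth8 attach the Pod to the
  # eight GPU-NIC networks (the names vpc1..vpc8 are assumed to match Network
  # objects created during cluster setup).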
spec:
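  # Required anti-affinity keeps the two test Pods on separate nodes.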
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: tcpxo
            operator: In
            values:
            - daemon
        topologyKey: "kubernetes.io/hostname"
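  # hostname + subdomain pair with the headless Service above so this Pod
  # resolves as host1.nccl-host-1.<namespace>.svc.cluster.local for MPI/SSH.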
  hostname: host1
  subdomain: nccl-host-1
  # Alternatively, run both containers on the host network:
  # hostNetwork: true
  # dnsPolicy: ClusterFirstWithHostNet
  containers:
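    # RxDM sidecar: required on every Pod that uses the GPUDirect-TCPXO data
    # path; it manages the receive datapath for the eight GPU NICs.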
    - name: tcpxo-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14
      imagePullPolicy: Always
      command: ["/bin/sh", "-c"]
      args:
        - |
          set -ex
          chmod 755 /fts/entrypoint_rxdm_container.sh
          /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
      securityContext:
        # privileged: true  # alternative to the capabilities granted below
        capabilities:
          add:
            - NET_ADMIN
            - NET_BIND_SERVICE
      volumeMounts:
        - name: nvidia
          mountPath: /usr/local/nvidia/lib64
        - name: sys
          mountPath: /hostsysfs
        - name: proc-sys
          mountPath: /hostprocsysfs
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
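    # Benchmark container: writes /scripts/allgather.sh, restarts sshd for MPI,
    # then sleeps so the test can be kicked off later via kubectl exec.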
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
      imagePullPolicy: Always
      # securityContext:
      #   privileged: true
      command:
        - /bin/sh
        - -c
        - |
          set -ex
          chmod 755 /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          # Write a wrapper that sets up SSH and the MPI hostfile for the
          # hosts passed as arguments, then launches the all_gather benchmark.
          cat >/scripts/allgather.sh <<EOF
          #!/bin/bash
          /scripts/init_ssh.sh \${@};
          pushd /scripts;
          /scripts/gen_hostfiles.sh \${@};
          popd;
          BENCHMARK=all_gather_perf NHOSTS=2 NCCL_LIB_DIR="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          EOF
          chmod +x /scripts/allgather.sh
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
          value: /dev/aperture_devices
      volumeMounts:
        - name: nvidia
          mountPath: /usr/local/nvidia/lib64
        - name: shared-memory
          mountPath: /dev/shm
        - name: aperture-devices
          mountPath: /dev/aperture_devices
      resources:
        limits:
          nvidia.com/gpu: 8
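  # hostPath volumes expose the node's NVIDIA libraries, sysfs/procfs (read by
  # the RxDM daemon), and the LLCM aperture devices; the in-memory emptyDir
  # backs NCCL's /dev/shm usage.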
  volumes:
    - name: nvidia
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    - name: shared-memory
      emptyDir:
        medium: "Memory"
        sizeLimit: 1Gi
    - name: sys
      hostPath:
        path: /sys
    - name: proc-sys
      hostPath:
        path: /proc/sys
    - name: aperture-devices
      hostPath:
        path: /dev/aperture_devices

---
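# Second node: identical to nccl-test-host-1 except for the Pod name, labels,
# hostname, and subdomain.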
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-2
  labels:
    name: nccl-host-2
    tcpxo: daemon
  annotations:
    devices.gke.io/container.tcpxo-daemon: |+
      - path: /dev/nvidia0
      - path: /dev/nvidia1
      - path: /dev/nvidia2
      - path: /dev/nvidia3
      - path: /dev/nvidia4
      - path: /dev/nvidia5
      - path: /dev/nvidia6
      - path: /dev/nvidia7
      - path: /dev/nvidiactl
      - path: /dev/nvidia-uvm
      - path: /dev/dmabuf_import_helper
    networking.gke.io/default-interface: 'eth0'
    networking.gke.io/interfaces: |
      [
        {"interfaceName":"eth0","network":"default"},
        {"interfaceName":"eth1","network":"vpc1"},
        {"interfaceName":"eth2","network":"vpc2"},
        {"interfaceName":"eth3","network":"vpc3"},
        {"interfaceName":"eth4","network":"vpc4"},
        {"interfaceName":"eth5","network":"vpc5"},
        {"interfaceName":"eth6","network":"vpc6"},
        {"interfaceName":"eth7","network":"vpc7"},
        {"interfaceName":"eth8","network":"vpc8"}
      ]
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: tcpxo
            operator: In
            values:
            - daemon
        topologyKey: "kubernetes.io/hostname"
  hostname: host2
  subdomain: nccl-host-2
  # hostNetwork: true
  # dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: tcpxo-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14
      imagePullPolicy: Always
      command: ["/bin/sh", "-c"]
      args:
        - |
          set -ex
          chmod 755 /fts/entrypoint_rxdm_container.sh
          /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
      securityContext:
        # privileged: true
        capabilities:
          add:
            - NET_ADMIN
            - NET_BIND_SERVICE
      volumeMounts:
        - name: nvidia
          mountPath: /usr/local/nvidia/lib64
        - name: sys
          mountPath: /hostsysfs
        - name: proc-sys
          mountPath: /hostprocsysfs
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
      imagePullPolicy: Always
      # securityContext:
      #   privileged: true
      command:
        - /bin/sh
        - -c
        - |
          set -ex
          chmod 755 /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          cat >/scripts/allgather.sh <<EOF
          #!/bin/bash
          /scripts/init_ssh.sh \${@};
          pushd /scripts;
          /scripts/gen_hostfiles.sh \${@};
          popd;
          BENCHMARK=all_gather_perf NHOSTS=2 NCCL_LIB_DIR="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          EOF
          chmod +x /scripts/allgather.sh
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
          value: /dev/aperture_devices
      volumeMounts:
        - name: nvidia
          mountPath: /usr/local/nvidia/lib64
        - name: shared-memory
          mountPath: /dev/shm
        - name: aperture-devices
          mountPath: /dev/aperture_devices
      resources:
        limits:
          nvidia.com/gpu: 8
  volumes:
    - name: nvidia
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    - name: shared-memory
      emptyDir:
        medium: "Memory"
        sizeLimit: 1Gi
    - name: sys
      hostPath:
        path: /sys
    - name: proc-sys
      hostPath:
        path: /proc/sys
    - name: aperture-devices
      hostPath:
        path: /dev/aperture_devices