---
# DaemonSet "nccl-tcpxo-installer" in the kube-system namespace.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: kube-system
  name: nccl-tcpxo-installer
  labels: {k8s-app: nccl-tcpxo-installer}
spec:
  # Replace pods node by node when the template changes.
  updateStrategy: {type: RollingUpdate}
  # Must match the k8s-app label set on the pod template.
  selector:
    matchLabels: {k8s-app: nccl-tcpxo-installer}
  template:
    metadata:
      labels:
        k8s-app: nccl-tcpxo-installer
        name: nccl-tcpxo-installer
    spec:
      # Pods share the node's network and PID namespaces.
      hostNetwork: true
      hostPID: true
      priorityClassName: system-node-critical
      # A key-less "Exists" toleration matches every taint.
      tolerations:
        - operator: Exists
      # Hard requirement: only nodes whose gke-accelerator label is
      # nvidia-h100-mega-80gb are eligible.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values: [nvidia-h100-mega-80gb]
      # Host directories made available to the pod's containers.
      volumes:
        - name: var-lib
          hostPath: {path: /var/lib}
        # NOTE(review): no volumeMounts entry in this manifest references
        # "tcpxo"; /var/lib/tcpxo is already reachable through "var-lib".
        # Confirm whether this volume is still needed.
        - name: tcpxo
          hostPath: {path: /var/lib/tcpxo}
        - name: library-dir-host
          hostPath: {path: /home/kubernetes/bin}
      initContainers:
        # Host-preparation step. `nsenter -at 1` enters all namespaces of PID 1
        # (visible because the pod sets hostPID: true), so the script below is
        # executed by the host's `sh` against the host's filesystem.
        #
        # The script:
        #   1. makes sure an INPUT rule accepting TCP exists, then loads the
        #      import-helper kernel module (only if the iptables step succeeded);
        #   2. bind-mounts the sysfs directory of every PCI device whose
        #      vendor:device id is 1ae0:0084 under /dev/aperture_devices/<BDF>;
        #   3. relaxes permissions on the resulting tree.
        #
        # NOTE(review): the image carries no tag or digest; consider pinning it.
        - image: "ubuntu"
          name: pre-installation
          securityContext:
            privileged: true
          command:
            - nsenter
            - -at
            - '1'
            - --
            - sh
            - -c
            - |
              # Check (-C) before inserting so reruns do not stack duplicate rules.
              { /sbin/iptables -C INPUT -p tcp -m tcp -j ACCEPT 2>/dev/null ||
                /sbin/iptables -I INPUT -p tcp -m tcp -j ACCEPT; } && modprobe import-helper
              sudo mkdir -p /dev/aperture_devices
              # Plain pipeline instead of `< <(...)`: process substitution is a
              # bash extension and is a syntax error under POSIX sh (e.g. dash).
              lspci -nn -D | grep '1ae0:0084' | while IFS= read -r line; do
                BDF=$(echo "$line" | awk '{print $1}')
                target_aperture_path="/dev/aperture_devices/$BDF"
                host_aperture_device=$(readlink -f "/sys/bus/pci/devices/$BDF")
                sudo mkdir -p "$target_aperture_path"
                # Best effort: drop a bind mount left by a previous run, if any.
                sudo umount -R "$target_aperture_path"
                sudo mount --bind "$host_aperture_device" "$target_aperture_path"
              done
              if [ -d /dev/aperture_devices ]; then
                  chmod -R a+r /dev/aperture_devices/
                  chmod a+rw /dev/aperture_devices/*/resource*
              fi
        # Second init container: runs the image's install script, then copies
        # /var/lib/tcpxo/lib64 into /usr/local/nvidia/lib64. /usr/local is the
        # "library-dir-host" mount, i.e. /home/kubernetes/bin on the host.
        - name: nccl-tcpxo-installer
          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 150m
          command:
            - /bin/sh
            - -c
          # `set -ex` aborts on the first failing command and echoes each one.
          args:
            - |
              set -ex
              chmod 755 /scripts/container_entry.sh
              /scripts/container_entry.sh install --install-nccl
              mkdir -p /usr/local/nvidia/lib64
              cp -r /var/lib/tcpxo/lib64/. /usr/local/nvidia/lib64
              echo "installation finishes"
          volumeMounts:
            - mountPath: /var/lib
              name: var-lib
            - mountPath: /usr/local
              name: library-dir-host
      # Sole regular container: a digest-pinned pause image. Presumably it only
      # keeps the pod running once the init containers have completed.
      containers:
        - name: pause
          image: 'gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830'