# Custom slurmd image for GPU training validation and container-aware Slurm
# jobs on DOKS B300/H100 nodes. Bundles CUDA, NCCL, nccl-tests, RDMA
# userspace, Pyxis, Enroot, and nsscache (libnss-cache + the nsscache python
# tool).
#
# Multi-stage build:
#   Stage 1 (builder):              compile nccl-tests using the official
#                                   nvidia/cuda devel image
#   helper  (toolbuilder):          shared Ubuntu toolchain base for stages 2-4
#   helper  (slurm-headers):        slinky slurmd image, used as the source of
#                                   the Slurm include tree
#   Stage 2 (pyxis-builder):        compile the Pyxis SPANK plugin against the
#                                   matching Slurm spank.h
#   Stage 3 (libnss-cache-builder): build libnss-cache (NSS module)
#   Stage 4 (nsscache-builder):     install the nsscache Python tool
#   Stage 5 (enroot-stage):         download Enroot .deb artifacts
#   Stage 6 (final):                slinky slurmd base + RDMA tools + NVIDIA
#                                   container toolkit + everything above
#
# CUDA_IMAGE controls the CUDA version used for compilation.
# The runtime CUDA driver comes from the host node — it is not baked into the
# image.
#
# B300 is Blackwell Ultra (compute capability sm_103). NVIDIA added sm_103
# codegen in CUDA 12.9, so 12.9 is the minimum toolkit that can build native
# Blackwell-Ultra binaries. Do not downgrade below 12.9 for B300 nodes.
# Minimum-version alternative, kept for reference:
# ARG CUDA_IMAGE=nvidia/cuda:12.9.0-devel-ubuntu24.04
ARG CUDA_IMAGE=nvidia/cuda:13.1.2-devel-ubuntu24.04

# Must equal the Slurm version of the slinky slurmd image referenced by the
# FROM lines below, so the Pyxis SPANK plugin links against a compatible
# spank.h ABI.
ARG SLURM_VERSION=25.11.5
ARG PYXIS_VERSION=0.21.0
ARG ENROOT_VERSION=4.0.1
ARG SLURM_PMIX_VERSION=pmix2

# Ubuntu 24.04 ships Python 3.12; this selects the dist-packages directory
# that the nsscache python package is staged from in the builder.
ARG PYTHON_VERSION=3.12

# libnss-cache release tag (https://github.com/google/libnss-cache)
ARG LIBNSSCACHE_VERSION=0.21

# nsscache (the Python tool) — pinned to the CoreWeave fork commit/tag that
# fixes an occasional missing `modifyTimestamp` field in LDAP responses.
ARG NSSCACHE_VERSION=v0.7.0

# Must match the `nsscache.nsscacheConfig.default.files_dir` helm setting.
ARG NSSCACHE_FILES_DIR=/etc/nsscache

# ── Stage 1: Build nccl-tests ────────────────────────────────────────────────
FROM --platform=linux/amd64 ${CUDA_IMAGE} AS builder

# NCCL is NOT preinstalled in nvidia/cuda:13.x devel images (it was in
# 12.x), so install libnccl-dev explicitly. Pin the +cuda13.1 build of NCCL
# so it matches this toolkit's CUDA version. The version globs are quoted so
# the shell hands them to apt verbatim instead of trying to expand them.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        build-essential \
        libopenmpi-dev \
        openmpi-bin \
        'libnccl2=*+cuda13.1' \
        'libnccl-dev=*+cuda13.1' \
    && rm -rf /var/lib/apt/lists/*

# NVCC_GENCODE emits native code for Blackwell Ultra (sm_103, B300) and
# Blackwell (sm_100, B200), an sm_90 (Hopper) fallback, plus forward-compat
# PTX so the binaries also JIT onto future architectures.
RUN git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git /tmp/nccl-tests && \
    cd /tmp/nccl-tests && \
    make MPI=1 \
         MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi \
         CUDA_HOME=/usr/local/cuda \
         NCCL_HOME=/usr \
         NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_103,code=sm_103 -gencode=arch=compute_103,code=compute_103" && \
    cp build/all_reduce_perf \
       build/reduce_scatter_perf \
       build/all_gather_perf \
       /usr/local/bin/ && \
    rm -rf /tmp/nccl-tests

# ── Shared lightweight builder for plugin/tool compilation ───────────────
FROM --platform=linux/amd64 ubuntu:24.04 AS toolbuilder
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        libcap-dev \
        libtool \
        autoconf \
        automake \
        pkg-config \
        python3-dev \
        python3-pip \
        python3-setuptools \
        python3-wheel \
        # python-ldap (nsscache dep) C extension build deps:
        libldap2-dev \
        libsasl2-dev \
        libssl-dev \
        # pycurl (nsscache dep) C extension build deps:
        libcurl4-openssl-dev \
        tar \
        xz-utils \
    && rm -rf /var/lib/apt/lists/*

# ── Stage 2: Build Pyxis ─────────────────────────────────────────────────────
# Pyxis is a SPANK plugin; it only needs the `slurm/` include tree (notably
# `spank.h` and the configure-generated `slurm_version.h`). Pull those
# headers directly from the slinky slurmd image so the plugin is guaranteed
# to match the runtime Slurm ABI. The image tag is derived from the global
# SLURM_VERSION build arg rather than a second hard-coded copy of the version.
FROM --platform=linux/amd64 ghcr.io/slinkyproject/slurmd:${SLURM_VERSION}-ubuntu24.04 AS slurm-headers

FROM toolbuilder AS pyxis-builder
ARG PYXIS_VERSION
COPY --from=slurm-headers /usr/include/slurm /usr/include/slurm
WORKDIR /build
# The tarball is saved under an explicit name so the following `tar` step does
# not depend on the filename the server advertises via Content-Disposition.
RUN curl --fail --show-error --silent --location \
        --output "pyxis-${PYXIS_VERSION}.tar.gz" \
        "https://github.com/NVIDIA/pyxis/archive/refs/tags/v${PYXIS_VERSION}.tar.gz" && \
    tar -xf "pyxis-${PYXIS_VERSION}.tar.gz" && \
    mkdir -p "/build/pyxis-${PYXIS_VERSION}/INSTALLDIR" && \
    cd "/build/pyxis-${PYXIS_VERSION}" && \
    make -j "$(nproc)" install DESTDIR="/build/pyxis-${PYXIS_VERSION}/INSTALLDIR"

# ── Stage 3: Build libnss-cache (NSS module) ─────────────────────────────────
FROM toolbuilder AS libnss-cache-builder
ARG LIBNSSCACHE_VERSION
WORKDIR /build
RUN git clone https://github.com/google/libnss-cache.git && \
    cd libnss-cache && \
    git checkout "version/${LIBNSSCACHE_VERSION}" && \
    make install

# ── Stage 4: Install nsscache (Python tool) ──────────────────────────────────
FROM toolbuilder AS nsscache-builder
ARG NSSCACHE_VERSION
WORKDIR /build
# The EXTERNALLY-MANAGED marker is deleted and --break-system-packages is
# passed so pip installs into the system interpreter of this throwaway stage.
RUN git clone https://github.com/coreweave/nsscache.git && \
    cd nsscache && \
    git checkout "${NSSCACHE_VERSION}" && \
    find /usr/lib/python3* -name EXTERNALLY-MANAGED -delete && \
    pip install --break-system-packages -r requirements.txt && \
    pip install --break-system-packages .
# ── Stage 5: Stage Enroot .deb artifacts ─────────────────────────────────────
FROM --platform=linux/amd64 ubuntu:24.04 AS enroot-stage
ARG ENROOT_VERSION
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
        ca-certificates curl && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /build
# Fetch both Enroot packages (base and +caps) for this stage's dpkg
# architecture into /build; the final stage copies them from here.
RUN arch="$(dpkg --print-architecture)" && \
    curl --fail --show-error --silent --location \
        --remote-name "https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot-hardened_${ENROOT_VERSION}-1_${arch}.deb" \
        --remote-name "https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot-hardened+caps_${ENROOT_VERSION}-1_${arch}.deb"

# ── Stage 6: Slinky slurmd base ──────────────────────────────────────────────
# The tag is built from the global SLURM_VERSION build arg so the runtime base
# cannot drift from the version declared at the top of this file.
FROM --platform=linux/amd64 ghcr.io/slinkyproject/slurmd:${SLURM_VERSION}-ubuntu24.04

USER root

# First pass: install base utilities + register the NVIDIA container toolkit
# *and* the CUDA apt sources. The NVIDIA container toolkit repo is needed by
# enroot's 98-nvidia.sh hook (it shells out to nvidia-container-cli). The
# CUDA repo lets us install libnccl2 / cuda-nvcc / cuda-cudart from official
# packages (rather than copying .so files out of the builder), which is what
# ClusterMAX's `dpkg -l | grep nccl` check expects.
# Downloads are written to temporary files instead of being piped into
# gpg/sed: RUN uses /bin/sh without `pipefail`, so a failed curl on the left
# side of a pipe would otherwise go unnoticed and could leave an empty
# sources list behind.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        gnupg && \
    install -d -m 0755 /usr/share/keyrings /etc/apt/sources.list.d && \
    curl --fail --show-error --silent --location \
        --output /tmp/nvidia-container-toolkit.gpgkey \
        https://nvidia.github.io/libnvidia-container/gpgkey && \
    gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
        /tmp/nvidia-container-toolkit.gpgkey && \
    chmod 0644 /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
    curl --fail --show-error --silent --location \
        --output /tmp/nvidia-container-toolkit.list \
        https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list && \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
        /tmp/nvidia-container-toolkit.list \
        > /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
    rm -f /tmp/nvidia-container-toolkit.gpgkey /tmp/nvidia-container-toolkit.list && \
    curl --fail --show-error --silent --location --remote-name \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm -f cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    rm -rf /var/lib/apt/lists/*

# Second pass: RDMA userspace + networking diagnostics + Enroot/Pyxis/nsscache
# runtime dependencies (including the NVIDIA container toolkit packages just
# registered above).
RUN apt-get update && apt-get install -y --no-install-recommends \
        # RDMA userspace + diagnostics. infiniband-diags provides `ibstat`,
        # which ClusterMAX requires alongside `rdma` and `ib_write_bw`.
        libibverbs-dev \
        rdma-core \
        ibverbs-utils \
        infiniband-diags \
        perftest \
        iproute2 \
        libopenmpi-dev \
        openmpi-bin \
        nfs-common \
        rsync \
        s3cmd \
        sysstat \
        unzip \
        # ClusterMAX essential userland (python3/curl/wget already present)
        git \
        nano \
        vim \
        wget \
        xz-utils \
        # Enroot runtime dependencies
        bash \
        jq \
        parallel \
        squashfs-tools \
        zstd \
        bsdmainutils \
        pigz \
        libcap2-bin \
        patch \
        # NVIDIA container toolkit (for nvidia-container-cli used by enroot)
        nvidia-container-toolkit \
        nvidia-container-toolkit-base \
        libnvidia-container-tools \
        libnvidia-container1 \
        # CUDA toolkit + NCCL from NVIDIA's apt repo so ClusterMAX's
        # `dpkg -l | grep nccl` check passes and `nvcc` is on PATH. Pin to
        # the +cuda13.1 NCCL build to match the toolkit version used by the
        # nccl-tests builder stage above. The globs are quoted so the shell
        # passes them to apt unchanged.
        cuda-nvcc-13-1 \
        cuda-cudart-13-1 \
        cuda-cudart-dev-13-1 \
        'libnccl2=*+cuda13.1' \
        'libnccl-dev=*+cuda13.1' \
        # nsscache python runtime + LDAP bindings
        python3 \
        python3-ldap \
        python3-pycurl \
    && rm -rf /var/lib/apt/lists/*

# Copy compiled benchmark binaries. NCCL runtime (libnccl.so*) and CUDA
# runtime (libcudart.so*) are now installed via apt (libnccl2 + cuda-cudart-*)
# in the second-pass install above, so we don't copy those .so files from
# the builder anymore — that lets ClusterMAX's `dpkg -l | grep nccl` check
# see a registered package.
COPY --from=builder \
     /usr/local/bin/all_reduce_perf \
     /usr/local/bin/reduce_scatter_perf \
     /usr/local/bin/all_gather_perf \
     /usr/local/bin/

# ── Install Enroot ───────────────────────────────────────────────────────────
ARG ENROOT_VERSION
COPY --from=enroot-stage /build/ /tmp/enroot/
COPY images/patches/enroot/ /build/patches/enroot/
# Steps performed by the RUN below:
#   1. Install both Enroot .deb files staged from enroot-stage.
#   2. Strip the --ldconfig argument from the 98-nvidia.sh hook: the default
#      --ldconfig flag breaks under enroot's container ldconfig detection in
#      some hosts; upstream slurm-containers strips it here too.
#   3. If any *.patch file sits directly in /build/patches/enroot, apply every
#      patch found under that directory to /usr/lib/enroot, in sorted order.
#   4. Remove the staged .debs, the patches, and the apt lists.
RUN arch="$(dpkg --print-architecture)" && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        "/tmp/enroot/enroot-hardened_${ENROOT_VERSION}-1_${arch}.deb" \
        "/tmp/enroot/enroot-hardened+caps_${ENROOT_VERSION}-1_${arch}.deb" && \
    sed -i 's/"--ldconfig=@$(command -v ldconfig.real || command -v ldconfig)"//' /etc/enroot/hooks.d/98-nvidia.sh && \
    if ls /build/patches/enroot/*.patch >/dev/null 2>&1; then \
        find /build/patches/enroot -type f -name '*.patch' -print0 \
            | sort -z \
            | xargs -t -0 -r -n 1 patch -p2 -d /usr/lib/enroot -i ; \
    fi && \
    rm -rf /tmp/enroot /build/patches /var/lib/apt/lists/*

# ── Install Pyxis SPANK plugin ───────────────────────────────────────────────
# Note: plugstack.conf is managed by the helm chart (compute.pyxis.enabled),
# so we deliberately don't register the plugin here.
ARG PYXIS_VERSION
COPY --from=pyxis-builder /build/pyxis-${PYXIS_VERSION}/INSTALLDIR/ /

# ── Install libnss-cache (NSS module) ────────────────────────────────────────
COPY --from=libnss-cache-builder /usr/lib/libnss_cache* /usr/lib/
# Recreate the .so.2 name as a symlink to the .so.2.0 file. No `|| true` here:
# a failure to create the link should fail the build instead of being hidden.
RUN rm -f /lib/libnss_cache.so.2 && \
    ln -sf /lib/libnss_cache.so.2.0 /lib/libnss_cache.so.2

# ── Install nsscache (Python tool) ───────────────────────────────────────────
ARG PYTHON_VERSION
COPY --from=nsscache-builder /usr/local/bin/nsscache /usr/local/bin/nsscache
COPY --from=nsscache-builder /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages

# Symlink the NSS cache files into /etc so libnss_cache.so finds them at the
# canonical NSS lookup paths (passwd.cache, group.cache, shadow.cache, sshkey.cache).
ARG NSSCACHE_FILES_DIR
RUN mkdir -p "${NSSCACHE_FILES_DIR}" && \
    for f in passwd group shadow sshkey; do \
        ln -sf "${NSSCACHE_FILES_DIR}/${f}.cache" "/etc/${f}.cache" ; \
    done

RUN ldconfig

# ── NCCL runtime defaults (ClusterMAX RoCEv2 expectation) ─────────────────
# Set NCCL_IB_GID_INDEX=3 so NCCL picks the RoCEv2 GID on Mellanox HCAs
# (RoCEv1 is at GID index 0; RoCEv2 is at index 3 on the dual-stack default).
# Deliberately do NOT set NCCL_MIN_NCHANNELS / NCCL_PROTO / NCCL_ALGO here —
# ClusterMAX explicitly checks that those are NOT overridden in the image so
# NCCL's autotuner can pick the best values per-fabric.
RUN printf '%s\n' 'NCCL_IB_GID_INDEX=3' > /etc/nccl.conf

# Put nvcc and the rest of the CUDA toolkit binaries on PATH so users can
# build CUDA code directly inside the container without sourcing anything.
ENV PATH="/usr/local/cuda/bin:${PATH}"
# Append any inherited LD_LIBRARY_PATH only when it is non-empty. An
# unconditional ":${LD_LIBRARY_PATH}" suffix would leave a trailing colon when
# the variable is unset, and the dynamic loader treats an empty entry as the
# current working directory.
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"