#!/usr/bin/env bash
# bootstrap_dsv4_spark.sh — zero-to-serving for DSV4-Flash W4A16-FP8 on dual DGX Spark TP=2.
#
# Run from anywhere with SSH access to both Sparks. Idempotent — safe to re-run.
#
# Usage:
#   ./bootstrap_dsv4_spark.sh \
#     --head-host spark-a \
#     --worker-host spark-b \
#     [--head-qsfp-ip 192.168.101.1] \
#     [--worker-qsfp-ip 192.168.101.2] \
#     [--qsfp-ifname enp1s0f0np0] \
#     [--vllm-ref ds4-sm120-experimental] \
#     [--image-tag vllm-w4a16-dsv4:exp] \
#     [--max-model-len 1048576] \
#     [--max-num-seqs 1] \
#     [--gpu-mem-util 0.90] \
#     [--ssh-user $USER] \
#     [--skip-build] [--skip-network] [--skip-download]
#
# What it does (in order):
#   1. SSH-reachability check for both hosts
#   2. Pre-download canada-quant/DeepSeek-V4-Flash-W4A16-FP8 on both (always idempotent via huggingface-cli)
#   3. Configure QSFP /30 on both (skippable)
#   4. Build vllm-w4a16-dsv4:exp on the HEAD box (skippable; uses build-and-copy.sh)
#   5. Distribute the image to WORKER (folded into step 4)
#   6. Stop any existing vllm_node containers
#   7. Launch worker rank 1 (--headless)
#   8. Launch head rank 0
#   9. Wait for /health=200, print build provenance + smoke test command

set -euo pipefail

# ---------------- defaults ----------------
HEAD_HOST=""
WORKER_HOST=""
HEAD_QSFP_IP="192.168.101.1"
WORKER_QSFP_IP="192.168.101.2"
QSFP_IFNAME="enp1s0f0np0"
VLLM_REF="ds4-sm120-experimental"
IMAGE_TAG="vllm-w4a16-dsv4:exp"
MAX_MODEL_LEN="1048576"
MAX_NUM_SEQS="1"
GPU_MEM_UTIL="0.90"
# USER can be unset (cron, minimal containers); with `set -u` a bare ${USER}
# would abort the script, so fall back to the effective user name.
SSH_USER="${USER:-$(id -un)}"
SKIP_BUILD=0
SKIP_NETWORK=0
SKIP_DOWNLOAD=0

# Print the header comment block (every '#' line that follows the shebang).
# Reads the block dynamically so --help stays correct when the header grows
# or shrinks, instead of depending on hard-coded line numbers.
usage() {
  local line
  while IFS= read -r line; do
    [[ "$line" == '#!'* ]] && continue
    [[ "$line" == '#'* ]] || break
    printf '%s\n' "$line"
  done < "$0"
}

# Abort with a usage error (exit 2) when the flag in $1 has no value after it.
# Call as `require_value "$@"` so that $# is the number of remaining args.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "missing value for $1" >&2
    exit 2
  fi
}

# ---------------- argument parsing ----------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --head-host)      require_value "$@"; HEAD_HOST="$2";      shift 2 ;;
    --worker-host)    require_value "$@"; WORKER_HOST="$2";    shift 2 ;;
    --head-qsfp-ip)   require_value "$@"; HEAD_QSFP_IP="$2";   shift 2 ;;
    --worker-qsfp-ip) require_value "$@"; WORKER_QSFP_IP="$2"; shift 2 ;;
    --qsfp-ifname)    require_value "$@"; QSFP_IFNAME="$2";    shift 2 ;;
    --vllm-ref)       require_value "$@"; VLLM_REF="$2";       shift 2 ;;
    --image-tag)      require_value "$@"; IMAGE_TAG="$2";      shift 2 ;;
    --max-model-len)  require_value "$@"; MAX_MODEL_LEN="$2";  shift 2 ;;
    --max-num-seqs)   require_value "$@"; MAX_NUM_SEQS="$2";   shift 2 ;;
    --gpu-mem-util)   require_value "$@"; GPU_MEM_UTIL="$2";   shift 2 ;;
    --ssh-user)       require_value "$@"; SSH_USER="$2";       shift 2 ;;
    --skip-build)     SKIP_BUILD=1;    shift ;;
    --skip-network)   SKIP_NETWORK=1;  shift ;;
    --skip-download)  SKIP_DOWNLOAD=1; shift ;;
    -h|--help)        usage; exit 0 ;;
    *)                echo "unknown arg: $1" >&2; exit 2 ;;
  esac
done

if [[ -z "$HEAD_HOST" || -z "$WORKER_HOST" ]]; then
  echo "--head-host and --worker-host required" >&2
  exit 2
fi

# ---------------- derived globals ----------------
# SSH targets for the head (rank 0) and worker (rank 1) boxes.
H="${SSH_USER}@${HEAD_HOST}"
W="${SSH_USER}@${WORKER_HOST}"
# Kept as a plain string on purpose: the rest of the script expands it
# unquoted (`ssh $SSH_OPTS ...`) and relies on word splitting.
SSH_OPTS="-o ConnectTimeout=10 -o StrictHostKeyChecking=accept-new"
DSV4_REPO_RAW="https://raw.githubusercontent.com/canada-quant/dsv4-flash-w4a16-fp8/main"

# Print a message prefixed with the current wall-clock time.
log() { echo "[$(date +%H:%M:%S)] $*"; }

# Failure diagnostics — called when a container exits before /health=200.
# Dumps enough state for the user to file a useful bug report.
# Arguments: $1 - "head" selects the head target ($H); any other value
#                 selects the worker target ($W).
# Globals:   H, W, SSH_OPTS, IMAGE_TAG (all read-only here).
# Outputs:   a diagnostics report on stdout. Every probe is best-effort
#            (`|| true`) so one failing probe never hides the others.
dump_failure_diag() {
  local which="$1"   # "head" or "worker"
  local target_host image_q
  if [[ "$which" == "head" ]]; then target_host="$H"; else target_host="$W"; fi
  # The image tag is spliced into a remote command line, so shell-quote it.
  printf -v image_q '%q' "$IMAGE_TAG"

  # shellcheck disable=SC2086 — SSH_OPTS is a space-separated option string.
  echo ""
  echo "================================================================"
  echo "FAILURE DIAGNOSTICS — ${which} (${target_host})"
  echo "================================================================"

  echo "--- last 300 lines of vllm_node logs ---"
  ssh $SSH_OPTS "$target_host" 'docker logs --tail 300 vllm_node 2>&1' || true

  echo ""
  echo "--- container env (VLLM_*, NCCL_*, TILELANG_*, HF_*, TORCH_*) ---"
  # `docker exec` only works on a running container; fall back to the env
  # recorded in the container config when it has already exited.
  ssh $SSH_OPTS "$target_host" "docker exec vllm_node sh -c 'env | grep -E \"^(VLLM|NCCL|TILELANG|HF|TORCH)_\" | sort' 2>/dev/null || \
    docker inspect --format '{{range .Config.Env}}{{println .}}{{end}}' vllm_node 2>/dev/null | grep -E '^(VLLM|NCCL|TILELANG|HF|TORCH)_' | sort" || true

  echo ""
  echo "--- build provenance from image ---"
  ssh $SSH_OPTS "$target_host" "docker run --rm --entrypoint cat ${image_q} /workspace/build-metadata.yaml 2>/dev/null || echo '(no build-metadata.yaml — image predates instrumentation)'" || true

  echo ""
  echo "--- nvidia-smi ---"
  ssh $SSH_OPTS "$target_host" 'nvidia-smi 2>&1 | head -40' || true

  echo ""
  echo "--- dmesg tail (last 50 lines, may need sudo) ---"
  # The fallback chain is grouped *before* the pipe. Written as
  # `sudo dmesg | tail || dmesg | tail || echo`, each `||` tests the exit
  # status of `tail` (always 0), so the fallbacks could never run.
  ssh $SSH_OPTS "$target_host" '{ sudo -n dmesg 2>/dev/null || dmesg 2>/dev/null || echo "(dmesg unavailable without sudo)"; } | tail -50' || true
  echo "================================================================"
}

# ---------------- 1. SSH reach ----------------
log "[1/9] SSH reachability check..."
# shellcheck disable=SC2086 — SSH_OPTS is a space-separated option string.
ssh $SSH_OPTS "$H" 'true' || { echo "cannot SSH to head $H" >&2; exit 3; }
# shellcheck disable=SC2086
ssh $SSH_OPTS "$W" 'true' || { echo "cannot SSH to worker $W" >&2; exit 3; }
log " ok — both Sparks reachable"

# ---------------- 2. Pre-download model ----------------
# Always invoke huggingface-cli download — it's idempotent and resumes via xet.
# The previous "is any .safetensors present?"
# gate passed on half-cached models
# and produced engines that crashed in kernel warmup on corrupt KV-cache tensors.
if [[ $SKIP_DOWNLOAD -eq 0 ]]; then
  log "[2/9] Ensuring canada-quant/DeepSeek-V4-Flash-W4A16-FP8 is fully cached on both Sparks (~143 GiB)..."
  for HOST in "$H" "$W"; do
    # The remote script is single-quoted: everything in it expands remotely.
    # ~/.local/bin is prepended to PATH because `pip install --user` puts the
    # CLI there, and a non-interactive SSH shell usually does not have that
    # directory on PATH — the download would otherwise fail right after a
    # fresh install.
    # shellcheck disable=SC2086 — SSH_OPTS is a space-separated option string.
    ssh $SSH_OPTS "$HOST" '
      set -e
      export PATH="$HOME/.local/bin:$PATH"
      command -v huggingface-cli >/dev/null 2>&1 || pip install --quiet --user huggingface_hub
      huggingface-cli download canada-quant/DeepSeek-V4-Flash-W4A16-FP8 >/dev/null
      echo " [$(hostname)] cache verified"
    '
  done
else
  log "[2/9] skipping model download (--skip-download)"
fi

# ---------------- 3. QSFP network ----------------
# Assign <ip>/30 to the QSFP interface on one host, set MTU 9000 and bring the
# link up.
# Arguments: $1 - SSH target, $2 - IPv4 address (without prefix length)
# Globals:   SSH_OPTS, QSFP_IFNAME (read)
# Returns:   non-zero if any remote step fails (remote `set -e`); without it
#            only the status of the final `ip link ... up` was reported.
configure_qsfp() {
  local target="$1" addr="$2"
  local cidr_q ifname_q
  printf -v cidr_q '%q' "${addr}/30"
  printf -v ifname_q '%q' "$QSFP_IFNAME"
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$target" "
    set -e
    sudo ip addr replace ${cidr_q} dev ${ifname_q}
    sudo ip link set dev ${ifname_q} mtu 9000
    sudo ip link set dev ${ifname_q} up
  "
}

if [[ $SKIP_NETWORK -eq 0 ]]; then
  log "[3/9] Configuring QSFP /30 on both Sparks..."
  configure_qsfp "$H" "$HEAD_QSFP_IP"
  configure_qsfp "$W" "$WORKER_QSFP_IP"
  log " verifying connectivity..."
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$H" "ping -c 2 -W 1 $(printf '%q' "$WORKER_QSFP_IP")" \
    || { echo "QSFP link not reachable" >&2; exit 4; }
  log " ok — QSFP up, < 1 ms RTT"
else
  log "[3/9] skipping network setup (--skip-network)"
fi

# ---------------- 4+5. Build + distribute image ----------------
# Build the image on the head box and copy it to the worker.
#
# eugr/spark-vllm-docker provides the build-and-copy.sh wrapper, ccache,
# and the standard vllm/flashinfer build steps.
#
# The DSV4-specific Dockerfile and both patches that the Dockerfile expects
# are pulled into the build context first. Without kylesayrs-deepseek-ct.patch
# the Dockerfile fails at the COPY step with
# 'kylesayrs-deepseek-ct.patch: not found' (HF #4).
#
# Globals: SSH_OPTS, H, IMAGE_TAG, VLLM_REF, WORKER_HOST, DSV4_REPO_RAW (read)
build_image_on_head() {
  local image_q ref_q worker_q
  # User-supplied values are shell-quoted before being spliced into the
  # remote script.
  printf -v image_q '%q' "$IMAGE_TAG"
  printf -v ref_q '%q' "$VLLM_REF"
  printf -v worker_q '%q' "$WORKER_HOST"
  # `\$` defers expansion to the remote shell; `\`+newline is consumed
  # locally, so build-and-copy.sh is sent as a single command line.
  # POSIX `[` is used remotely so the script does not depend on the remote
  # login shell being bash.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$H" "
    set -e
    BUILD_DIR=\"\$HOME/dsv4-spark-build\"
    mkdir -p \"\$BUILD_DIR\"
    cd \"\$BUILD_DIR\"
    if [ ! -d spark-vllm-docker ]; then
      git clone --depth 1 https://github.com/eugr/spark-vllm-docker
    fi
    cd spark-vllm-docker
    curl -fsSL -o Dockerfile '${DSV4_REPO_RAW}/scripts/Dockerfile.dsv4-spark'
    curl -fsSL -o kylesayrs-deepseek-ct.patch '${DSV4_REPO_RAW}/scripts/kylesayrs-deepseek-ct.patch'
    curl -fsSL -o patch_v4_packed_mapping.py '${DSV4_REPO_RAW}/scripts/patch_v4_packed_mapping.py'
    ./build-and-copy.sh \
      -t ${image_q} \
      --vllm-ref ${ref_q} \
      --rebuild-vllm \
      -c ${worker_q} \
      --full-log
  "
}

if [[ $SKIP_BUILD -eq 0 ]]; then
  log "[4-5/9] Building ${IMAGE_TAG} on ${HEAD_HOST} from jasl/vllm@${VLLM_REF} + vendored kylesayrs patch + packed_modules patch..."
  log " (~25-40 min on a Spark; image ships to worker via docker save | scp | docker load)"
  build_image_on_head
  log " build done — image present on both Sparks"
else
  log "[4-5/9] skipping build (--skip-build)"
fi

# ---------------- 6. Stop existing containers ----------------
log "[6/9] Stopping any existing vllm_node containers..."
# Best-effort: a missing container is not an error.
# shellcheck disable=SC2086
ssh $SSH_OPTS "$H" 'docker rm -f vllm_node 2>/dev/null || true'
# shellcheck disable=SC2086
ssh $SSH_OPTS "$W" 'docker rm -f vllm_node 2>/dev/null || true'
sleep 2

# ---------------- 7+8.
# Launch ----------------
# Environment passed to the container on both nodes.
ENV_FLAGS=(
  -e "VLLM_TRITON_MLA_SPARSE=1"
  -e "VLLM_TRITON_MLA_SPARSE_HEAD_BLOCK_SIZE=4"
  -e "VLLM_RPC_TIMEOUT=600000"
  -e "VLLM_ENGINE_READY_TIMEOUT_S=3600"
  -e "TILELANG_CLEANUP_TEMP_FILES=1"
  -e "HF_HUB_OFFLINE=1"
  -e "NCCL_IB_DISABLE=0"
  -e "NCCL_NET_PLUGIN=none"
  -e "NCCL_IB_SUBNET_AWARE_ROUTING=1"
  -e "NCCL_IB_MERGE_NICS=0"
  -e "GLOO_SOCKET_IFNAME=${QSFP_IFNAME}"
  -e "NCCL_SOCKET_IFNAME=${QSFP_IFNAME}"
)

# `docker run` prefix shared by both nodes.
DOCKER_FLAGS=(
  docker run -d --name vllm_node
  --gpus all --network=host --ipc=host
  --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864
  # Literal $HOME on purpose: it must be expanded by the *remote* shell.
  -v "\$HOME/.cache/huggingface:/root/.cache/huggingface"
)

# Engine command line common to both ranks; rank-specific flags are appended
# by launch_node.
ENGINE_CMD=(
  vllm serve canada-quant/DeepSeek-V4-Flash-W4A16-FP8
  --served-model-name DSV4-W4A16-FP8 deepseek-ai/DeepSeek-V4-Flash deepseek-v4-flash
  --trust-remote-code
  --kv-cache-dtype fp8
  --block-size 256
  --tokenizer-mode deepseek_v4
  --tool-call-parser deepseek_v4
  --enable-auto-tool-choice
  --reasoning-parser deepseek_v4
  --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
  --max-model-len "${MAX_MODEL_LEN}"
  --max-num-seqs "${MAX_NUM_SEQS}"
  --max-num-batched-tokens 8192
  --gpu-memory-utilization "${GPU_MEM_UTIL}"
  --host 0.0.0.0 --port 8888
  -tp 2 --nnodes 2
  --master-addr "${HEAD_QSFP_IP}" --master-port 29501
)

# Start the vllm_node container on one host.
# Arguments: $1 - SSH target
#            $2 - address exported to the container as VLLM_HOST_IP
#            $3.. - extra engine arguments appended to ENGINE_CMD
# Globals:   SSH_OPTS, DOCKER_FLAGS, ENV_FLAGS, ENGINE_CMD, IMAGE_TAG (read)
# Returns:   the exit status of ssh / the remote `docker run`.
launch_node() {
  local target="$1" host_ip="$2"
  shift 2
  local IFS=' '   # make "${DOCKER_FLAGS[*]}" join on single spaces
  local env_q host_ip_q image_q engine_q engine_qq

  # The command crosses two shells: the remote login shell, then the
  # `bash -c` inside the container. Quote once per layer with %q instead of
  # wrapping %q output in literal single quotes, which breaks as soon as an
  # argument needs $'…' quoting or contains a quote character.
  printf -v env_q '%q ' "${ENV_FLAGS[@]}"
  printf -v host_ip_q '%q' "VLLM_HOST_IP=${host_ip}"
  printf -v image_q '%q' "$IMAGE_TAG"
  printf -v engine_q '%q ' "${ENGINE_CMD[@]}" "$@"   # parsed by bash -c
  printf -v engine_qq '%q' "$engine_q"               # parsed by remote shell

  # DOCKER_FLAGS is joined raw so the remote shell still expands $HOME.
  # env_q already ends with a space, hence "${env_q}-e".
  # shellcheck disable=SC2086 — SSH_OPTS is a space-separated option string.
  ssh $SSH_OPTS "$target" \
    "${DOCKER_FLAGS[*]} ${env_q}-e ${host_ip_q} ${image_q} bash -c ${engine_qq}"
}

# Worker first (rank 1, headless) — it waits for the head to broadcast RPC.
log "[7/9] Launching worker rank 1 on ${WORKER_HOST} (headless)..."
launch_node "$W" "$WORKER_QSFP_IP" --node-rank 1 --headless
sleep 5

log "[8/9] Launching head rank 0 on ${HEAD_HOST} (API server on :8888)..."
launch_node "$H" "$HEAD_QSFP_IP" --node-rank 0

# ---------------- 9. Wait for /health ----------------
log "[9/9] Waiting for /health=200 on http://${HEAD_HOST}:8888 (~5-7 min cold start)..."
WAITED=0 until ssh $SSH_OPTS "$H" "curl -sf http://localhost:8888/health > /dev/null"; do WAITED=$((WAITED + 30)) if [[ $WAITED -gt 1800 ]]; then echo "engine boot timeout (30 min) — dumping diagnostics from both nodes:" dump_failure_diag head dump_failure_diag worker exit 5 fi HEAD_STATE=$(ssh $SSH_OPTS "$H" 'docker inspect --format "{{.State.Status}}" vllm_node 2>/dev/null || echo missing') WORKER_STATE=$(ssh $SSH_OPTS "$W" 'docker inspect --format "{{.State.Status}}" vllm_node 2>/dev/null || echo missing') if [[ "$HEAD_STATE" == *exited* ]]; then echo "engine container exited on head:" dump_failure_diag head echo "" echo "(also dumping worker state for context — it may have died first and taken head with it)" dump_failure_diag worker exit 6 fi if [[ "$WORKER_STATE" == *exited* ]]; then echo "engine container exited on worker:" dump_failure_diag worker echo "" echo "(also dumping head state for context)" dump_failure_diag head exit 6 fi log " still booting (${WAITED}s elapsed, head=${HEAD_STATE}, worker=${WORKER_STATE})..." sleep 30 done log "" log "========================================================================" log "ENGINE READY ✓" log " endpoint: http://${HEAD_HOST}:8888/v1" log " model names: DSV4-W4A16-FP8 | deepseek-ai/DeepSeek-V4-Flash | deepseek-v4-flash" log " context: ${MAX_MODEL_LEN} tokens, max-num-seqs=${MAX_NUM_SEQS}" log " vllm ref: jasl/vllm@${VLLM_REF}" log " image: ${IMAGE_TAG}" log "========================================================================" # Build provenance — write to a local file for bug reports, also print key fields. PROV_FILE="/tmp/dsv4-spark-build-metadata-$(date +%Y%m%d-%H%M%S).yaml" if ssh $SSH_OPTS "$H" "docker exec vllm_node cat /workspace/build-metadata.yaml" > "$PROV_FILE" 2>/dev/null && [[ -s "$PROV_FILE" ]]; then log "" log "Build provenance written to ${PROV_FILE}:" sed 's/^/ /' "$PROV_FILE" log "" log "If you hit issues, please paste the contents of ${PROV_FILE} with your bug report." 
else log "" log "(no /workspace/build-metadata.yaml in this image — likely a pre-instrumentation build)" rm -f "$PROV_FILE" fi log "" log "Smoke test:" log "" cat <