[2025-05-23 07:40:11] [INFO] VERL_RUN_SCRIPT_PATH is set to: '/DATA_PATH/grpo.git/verl/qwen_moe_large_mcore.sh' [2025-05-23 07:40:11] [INFO] SLURM_NNODES: 16 [2025-05-23 07:40:11] [INFO] SLURM_JOB_NUM_NODES: 16 [2025-05-23 07:40:11] [INFO] EXPECTED_TOTAL_NODES set to: 16 [2025-05-23 07:40:11] [INFO] Fetching node list... [2025-05-23 07:40:11] [INFO] Node list: [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [HOSTNAME] [2025-05-23 07:40:11] [INFO] Container workdir: /DATA_PATH//verl.git [2025-05-23 07:40:11] [INFO] Head node designated: [HOSTNAME] [2025-05-23 07:40:11] [INFO] Fetching IP address for head node [HOSTNAME]... [2025-05-23 07:40:11] [INFO] Raw IP address for [HOSTNAME]: '[PRIVATE_IP]' [2025-05-23 07:40:11] [INFO] Ray head address configured: [PRIVATE_IP]:6379 (Port: 6379) SHELL=/bin/bash SLURM_GPUS_PER_NODE=8 UV_CACHE_DIR=/DATA_PATH//cache/uv COLORTERM=truecolor SLURM_JOB_USER=USER SLURM_TASKS_PER_NODE=1(x16) SLURM_JOB_UID=20001 EXPECTED_TOTAL_NODES=16 SLURM_EXPORT_ENV=ALL,VERL_RUN_SCRIPT_PATH=/DATA_PATH/grpo.git/verl/qwen_moe_large_mcore.sh TERM_PROGRAM_VERSION=0.50.5 SLURM_TASK_PID=2182693 SLURM_JOB_GPUS=0,1,2,3,4,5,6,7 SLURM_LOCALID=0 SLURM_SUBMIT_DIR=/DATA_PATH HOSTNAME=[HOSTNAME] SLURMD_NODENAME=[HOSTNAME] SLURM_JOB_START_TIME=1747986008 HYDRA_LAUNCHER_EXTRA_ARGS=--external-launcher SLURM_CLUSTER_NAME=ovis SLURM_JOB_END_TIME=1755762011 BASE_DIR=/DATA_PATH/ SLURM_CPUS_ON_NODE=96 SLURM_JOB_CPUS_PER_NODE=96(x16) VLLM_CACHE_ROOT=/DATA_PATH//cache/vllm SLURM_GPUS_ON_NODE=8 PRTE_MCA_plm_slurm_args=--external-launcher PWD=/DATA_PATH SLURM_GTIDS=0 LOGNAME=USER XDG_SESSION_TYPE=tty SLURM_JOB_PARTITION=hpc MODULESHOME=/usr/share/modules CYCLECLOUD_HOME=/opt/cycle/jetpack MANPATH=: ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 NCCL_DEBUG=INFO SLURM_JOB_NUM_NODES=16 SLURM_JOBID=261 I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS=--external-launcher IP_HEAD=[PRIVATE_IP]:6379 MOTD_SHOWN=pam HEAD_PORT=6379 HOME=/HOME LANG=C.UTF-8 LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: SLURM_PROCID=0 SSL_CERT_DIR=/usr/lib/ssl/certs TMPDIR=/tmp SLURM_NTASKS=16 SLURM_TOPOLOGY_ADDR=[HOSTNAME] SSH_CONNECTION=[PRIVATE_IP] 38008 [PRIVATE_IP] 22 TRITON_CACHE_DIR=/DATA_PATH//cache/triton ZE_AFFINITY_MASK=0,1,2,3,4,5,6,7 HYDRA_BOOTSTRAP=slurm SLURM_TOPOLOGY_ADDR_PATTERN=node CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 SGX_AESM_ADDR=1 HF_HOME=/DATA_PATH//cache/hf LESSCLOSE=/usr/bin/lesspipe %s %s XDG_SESSION_CLASS=user SLURM_MEM_PER_NODE=1740800 TERM=xterm-256color ZES_ENABLE_SYSMAN=1 TORCH_NCCL_ASYNC_ERROR_HANDLING=1 LESSOPEN=| /usr/bin/lesspipe %s USER=USER SLURM_NODELIST=ovis-hpc-[1-16] VSCODE_GIT_IPC_HANDLE=/run/user/20001/vscode-git-dc0c74a15a.sock ENVIRONMENT=BATCH GPU_DEVICE_ORDINAL=0,1,2,3,4,5,6,7 LOADEDMODULES= SLURM_PRIO_PROCESS=0 SLURM_NPROCS=16 NCCL_IB_TIMEOUT=24 SHLVL=3 SLURM_NNODES=16 CUDA_DEVICE_MAX_CONNECTIONS=1 XDG_SESSION_ID=1310 SLURM_SUBMIT_HOST=ovis-login-1 ROLLOUT_NAME=vllm LD_LIBRARY_PATH=:/usr/local/cuda/lib64:/usr/local/cuda/lib64 XDG_RUNTIME_DIR=/run/user/20001 SLURM_JOB_ID=261 SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt SLURM_NODEID=0 FLASHINFER_WORKSPACE_BASE=/DATA_PATH//cache/flashinfer SSH_CLIENT=[PRIVATE_IP] 38008 22 XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop SLURM_CONF=/etc/slurm/slurm.conf BROWSER=/HOME/.cursor-server/cli/servers/Stable-96e5b01ca25f8fbd4c4c10bc69b15f6228c80770/server/bin/helpers/browser.sh PATH=/HOME/.cursor-server/cli/servers/Stable-96e5b01ca25f8fbd4c4c10bc69b15f6228c80770/server/bin/remote-cli:/HOME/.local/bin:/HOME/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/opt/cycle/jetpack/bin:/usr/local/cuda/bin:/opt/cycle/jetpack/bin:/HOME/.fzf/bin:/usr/local/cuda/bin SLURM_JOB_NAME=qwen-235b-grpo-megatron-charlie MODULEPATH=/etc/environment-modules/modules:/usr/share/modules/versions:/usr/share/modules/$MODULE_VERSION/modulefiles:/usr/share/modules/modulefiles CYCLECLOUD_BOOTSTRAP=/opt/cycle/jetpack/system/bootstrap SLURM_NTASKS_PER_NODE=1 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/20001/bus VERL_RUN_SCRIPT_PATH=/DATA_PATH/grpo.git/verl/qwen_moe_large_mcore.sh MAIL=/var/mail/USER OMPI_MCA_plm_slurm_args=--external-launcher SLURM_JOB_GID=20001 SLURM_GET_USER_ENV=1 CONFIG_DIR_PATH=/DATA_PATH//grpo.git/verl/ OLDPWD=/DATA_PATH/verl.git HEAD_NODE_IP=[PRIVATE_IP] NCCL_TOPO_FILE=/opt/microsoft/ndv5-topo.xml SLURM_JOB_NODELIST=ovis-hpc-[1-16] MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl TERM_PROGRAM=vscode I_MPI_HYDRA_BOOTSTRAP=slurm VSCODE_IPC_HOOK_CLI=/run/user/20001/vscode-ipc-0b3c3ab6-0b58-4e2c-a266-c35b490356d4.sock BASH_FUNC_ml%%=() { module ml "$@" } BASH_FUNC_module%%=() { eval `/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash "$@"`; _mlstatus=$?; return $_mlstatus } _=/usr/bin/printenv [2025-05-23 07:40:11] [INFO] Starting HEAD node script at [HOSTNAME] which will manage Ray head, wait for workers, and launch the application. [2025-05-23 07:40:11] [INFO] Number of worker nodes to start: 15 Starting WORKER 1 at [HOSTNAME] Starting WORKER 2 at [HOSTNAME] Starting WORKER 3 at [HOSTNAME] Starting WORKER 4 at [HOSTNAME] Starting WORKER 5 at [HOSTNAME] Starting WORKER 6 at [HOSTNAME] Starting WORKER 7 at [HOSTNAME] Starting WORKER 8 at [HOSTNAME] Starting WORKER 9 at [HOSTNAME] Starting WORKER 10 at [HOSTNAME] Starting WORKER 11 at [HOSTNAME] Starting WORKER 12 at [HOSTNAME] Starting WORKER 13 at [HOSTNAME] Starting WORKER 14 at [HOSTNAME] Starting WORKER 15 at [HOSTNAME] Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... [$(date '+%Y-%m-%d %H:%M:%S')] [INFO] [srun-head-bootstrap] Current directory in container: $(pwd). Running script: /DATA_PATH/grpo.git/verl/qwen_moe_large_mcore.sh Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... Current directory: /DATA_PATH/verl.git. Installing project in editable mode with vllm extras from /DATA_PATH/verl.git on worker [HOSTNAME]... [2025-05-23 07:40:35] [INFO] [run_head_node.sh] Current directory: /DATA_PATH/verl.git [2025-05-23 07:40:35] [INFO] [run_head_node.sh] Installing project in editable mode with vllm extras... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... Installation complete. Starting Ray worker node on [HOSTNAME]... [2025-05-23 07:40:53] [INFO] [run_head_node.sh] Installation complete. [2025-05-23 07:40:53] [INFO] [run_head_node.sh] Starting Ray head node on [PRIVATE_IP]:6379 with 8 GPUs... [2025-05-23 07:40:53] [INFO] [run_head_node.sh] Ray head started in background with PID 2184280. [2025-05-23 07:40:53] [INFO] [run_head_node.sh] Waiting for Ray head node at [PRIVATE_IP]:6379 to be ready... [2025-05-23 07:40:55] [INFO] [run_head_node.sh] Ray head not ready yet (Attempt 1/60). Retrying in 10s... 2025-05-23 07:40:52,473 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:55,406 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:55,406 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:55,406 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:55,406 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:55,406 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:55,407 INFO scripts.py:1181 -- --block 2025-05-23 07:40:55,407 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:55,407 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,190 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:56,422 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:56,422 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:56,422 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:56,422 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:56,422 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:56,423 INFO scripts.py:1181 -- --block 2025-05-23 07:40:56,423 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:56,423 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,721 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:56,518 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:56,519 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:56,519 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:56,519 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:56,519 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:56,519 INFO scripts.py:1181 -- --block 2025-05-23 07:40:56,519 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:56,519 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:51,828 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:56,791 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:56,791 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:56,791 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:56,791 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:56,791 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:56,791 INFO scripts.py:1181 -- --block 2025-05-23 07:40:56,791 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:56,791 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,002 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,129 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,129 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,129 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,129 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,129 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,129 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,129 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,129 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,189 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,530 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,531 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,531 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,531 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,531 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,531 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,531 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,531 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,086 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,560 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,561 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,561 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,561 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,561 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,561 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,561 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,561 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:51,938 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,704 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,705 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,705 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,705 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,705 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,705 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,705 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,705 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,097 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,704 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,704 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,704 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,704 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,704 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,704 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,704 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,704 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:51,905 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,804 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,804 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,804 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,804 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,804 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,805 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,805 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,805 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,386 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,807 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:57,807 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:57,807 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:57,808 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,808 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:57,808 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,808 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,808 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:54,911 INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details. 2025-05-23 07:40:54,911 INFO scripts.py:971 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:57,858 SUCC scripts.py:1007 -- -------------------- 2025-05-23 07:40:57,858 SUCC scripts.py:1008 -- Ray runtime started. 2025-05-23 07:40:57,858 SUCC scripts.py:1009 -- -------------------- 2025-05-23 07:40:57,859 INFO scripts.py:1011 -- Next steps 2025-05-23 07:40:57,859 INFO scripts.py:1014 -- To add another node to this Ray cluster, run 2025-05-23 07:40:57,859 INFO scripts.py:1017 --  ray start --address='[PRIVATE_IP]:6379' 2025-05-23 07:40:57,859 INFO scripts.py:1026 -- To connect to this Ray cluster: 2025-05-23 07:40:57,859 INFO scripts.py:1028 -- import ray 2025-05-23 07:40:57,859 INFO scripts.py:1029 -- ray.init(_node_ip_address='[PRIVATE_IP]') 2025-05-23 07:40:57,859 INFO scripts.py:1041 -- To submit a Ray job using the Ray Jobs CLI: 2025-05-23 07:40:57,859 INFO scripts.py:1042 --  RAY_ADDRESS='http://127.0.0.1:8265' ray job submit --working-dir . -- python my_script.py 2025-05-23 07:40:57,859 INFO scripts.py:1051 -- See https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html 2025-05-23 07:40:57,859 INFO scripts.py:1055 -- for more information on submitting Ray jobs to the Ray cluster. 2025-05-23 07:40:57,859 INFO scripts.py:1060 -- To terminate the Ray runtime, run 2025-05-23 07:40:57,859 INFO scripts.py:1061 --  ray stop 2025-05-23 07:40:57,859 INFO scripts.py:1064 -- To view the status of the cluster, use 2025-05-23 07:40:57,859 INFO scripts.py:1065 -- ray status 2025-05-23 07:40:57,859 INFO scripts.py:1069 -- To monitor and debug Ray, view the dashboard at 2025-05-23 07:40:57,859 INFO scripts.py:1070 -- 127.0.0.1:8265 2025-05-23 07:40:57,859 INFO scripts.py:1077 -- If connection to the dashboard fails, check your firewall settings and network configuration. 2025-05-23 07:40:57,859 INFO scripts.py:1181 -- --block 2025-05-23 07:40:57,859 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:57,859 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,393 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:58,284 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:58,285 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:58,285 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:58,285 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:58,285 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:58,285 INFO scripts.py:1181 -- --block 2025-05-23 07:40:58,285 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:58,285 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,522 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:58,292 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:58,292 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:58,292 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:58,292 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:58,292 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:58,292 INFO scripts.py:1181 -- --block 2025-05-23 07:40:58,292 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:58,292 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,355 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:58,725 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:58,726 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:58,726 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:58,726 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:58,726 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:58,726 INFO scripts.py:1181 -- --block 2025-05-23 07:40:58,726 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:58,726 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. 2025-05-23 07:40:52,418 INFO scripts.py:1152 -- Local node IP: [PRIVATE_IP] 2025-05-23 07:40:59,290 SUCC scripts.py:1168 -- -------------------- 2025-05-23 07:40:59,290 SUCC scripts.py:1169 -- Ray runtime started. 2025-05-23 07:40:59,290 SUCC scripts.py:1170 -- -------------------- 2025-05-23 07:40:59,290 INFO scripts.py:1172 -- To terminate the Ray runtime, run 2025-05-23 07:40:59,290 INFO scripts.py:1173 --  ray stop 2025-05-23 07:40:59,291 INFO scripts.py:1181 -- --block 2025-05-23 07:40:59,291 INFO scripts.py:1182 -- This command will now block forever until terminated by a signal. 2025-05-23 07:40:59,291 INFO scripts.py:1185 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported. [2025-05-23 07:41:06] [INFO] [run_head_node.sh] Ray head node at [PRIVATE_IP]:6379 is ready. [2025-05-23 07:41:06] [INFO] [run_head_node.sh] Waiting for 16 total nodes to join the Ray cluster (including head)... [2025-05-23 07:41:08] [INFO] [run_head_node.sh] Sufficient number of nodes (16/16) detected in the Ray cluster. [2025-05-23 07:41:08] [INFO] [run_head_node.sh] All 16 nodes are ready in the Ray cluster. [2025-05-23 07:41:08] [INFO] [run_head_node.sh] Launching main PPO training script... (TaskRunner pid=2191279) {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model', 'extra']}, (TaskRunner pid=2191279) 'clip_ratio': 0.2, (TaskRunner pid=2191279) 'clip_ratio_c': 3.0, (TaskRunner pid=2191279) 'clip_ratio_high': 0.2, (TaskRunner pid=2191279) 'clip_ratio_low': 0.2, (TaskRunner pid=2191279) 'data_loader_seed': None, (TaskRunner pid=2191279) 'entropy_coeff': 0, (TaskRunner pid=2191279) 'kl_loss_coef': 0.001, (TaskRunner pid=2191279) 'kl_loss_type': 'low_var_kl', (TaskRunner pid=2191279) 'load_weight': True, (TaskRunner pid=2191279) 'loss_agg_mode': 'token-mean', (TaskRunner pid=2191279) 'megatron': {'context_parallel_size': 1, (TaskRunner pid=2191279) 'dist_checkpointing_path': '/DATA_PATH//models/Qwen3-235B-A22B-mcore', (TaskRunner pid=2191279) 'expert_model_parallel_size': 32, (TaskRunner pid=2191279) 'expert_tensor_parallel_size': 1, (TaskRunner pid=2191279) 'grad_offload': True, (TaskRunner pid=2191279) 'optimizer_offload': True, (TaskRunner pid=2191279) 'param_offload': True, (TaskRunner pid=2191279) 'pipeline_model_parallel_size': 2, (TaskRunner pid=2191279) 'seed': 1, (TaskRunner pid=2191279) 'sequence_parallel': True, (TaskRunner pid=2191279) 'tensor_model_parallel_size': 2, (TaskRunner pid=2191279) 'use_dist_checkpointing': True, (TaskRunner pid=2191279) 'use_distributed_optimizer': True, (TaskRunner pid=2191279) 'virtual_pipeline_model_parallel_size': None}, (TaskRunner pid=2191279) 'optim': {'clip_grad': 1.0, (TaskRunner pid=2191279) 'lr': 1e-06, (TaskRunner pid=2191279) 'lr_warmup_steps': -1, (TaskRunner pid=2191279) 'lr_warmup_steps_ratio': 0.0, (TaskRunner pid=2191279) 'min_lr_ratio': None, (TaskRunner pid=2191279) 'total_training_steps': -1, (TaskRunner pid=2191279) 'warmup_style': 'constant', (TaskRunner pid=2191279) 'weight_decay': 0.01}, (TaskRunner pid=2191279) 'ppo_epochs': 1, (TaskRunner pid=2191279) 'ppo_micro_batch_size': None, (TaskRunner pid=2191279) 'ppo_micro_batch_size_per_gpu': 1, (TaskRunner pid=2191279) 'ppo_mini_batch_size': 128, (TaskRunner pid=2191279) 'profile': {'profile_ranks': None, (TaskRunner pid=2191279) 'save_path': None, (TaskRunner pid=2191279) 'step_end': -1, (TaskRunner pid=2191279) 'step_start': -1, (TaskRunner pid=2191279) 'use_profile': False}, (TaskRunner pid=2191279) 'shuffle': False, (TaskRunner pid=2191279) 'strategy': 'megatron', (TaskRunner pid=2191279) 'use_dynamic_bsz': False, (TaskRunner pid=2191279) 'use_kl_loss': True, (TaskRunner pid=2191279) 'use_torch_compile': True}, (TaskRunner pid=2191279) 'hybrid_engine': True, (TaskRunner pid=2191279) 'model': {'enable_gradient_checkpointing': True, (TaskRunner pid=2191279) 'external_lib': None, (TaskRunner pid=2191279) 'gradient_checkpointing_kwargs': {'activations_checkpoint_granularity': None, (TaskRunner pid=2191279) 'activations_checkpoint_method': None, (TaskRunner pid=2191279) 'activations_checkpoint_num_layers': None}, (TaskRunner pid=2191279) 'override_config': {}, (TaskRunner pid=2191279) 'path': '/data/base_models/Qwen3-235B-A22B'}, (TaskRunner pid=2191279) 'ref': {'load_weight': True, (TaskRunner pid=2191279) 'log_prob_micro_batch_size': None, (TaskRunner pid=2191279) 'log_prob_micro_batch_size_per_gpu': 1, (TaskRunner pid=2191279) 'megatron': {'context_parallel_size': 1, (TaskRunner pid=2191279) 'dist_checkpointing_path': '/DATA_PATH//models/Qwen3-235B-A22B-mcore', (TaskRunner pid=2191279) 'expert_model_parallel_size': 32, (TaskRunner pid=2191279) 'expert_tensor_parallel_size': 1, (TaskRunner pid=2191279) 'param_offload': True, (TaskRunner pid=2191279) 'pipeline_model_parallel_size': 2, (TaskRunner pid=2191279) 'seed': 1, (TaskRunner pid=2191279) 'sequence_parallel': True, (TaskRunner pid=2191279) 'tensor_model_parallel_size': 2, (TaskRunner pid=2191279) 'use_dist_checkpointing': True, (TaskRunner pid=2191279) 'use_distributed_optimizer': False, (TaskRunner pid=2191279) 'virtual_pipeline_model_parallel_size': None}, (TaskRunner pid=2191279) 'profile': {'profile_ranks': None, (TaskRunner pid=2191279) 'save_path': None, (TaskRunner pid=2191279) 'step_end': -1, (TaskRunner pid=2191279) 'step_start': -1, (TaskRunner pid=2191279) 'use_profile': False}, (TaskRunner pid=2191279) 'strategy': 'megatron', (TaskRunner pid=2191279) 'use_torch_compile': True}, (TaskRunner pid=2191279) 'rollout': {'chat_scheduler': None, (TaskRunner pid=2191279) 'disable_log_stats': True, (TaskRunner pid=2191279) 'do_sample': True, (TaskRunner pid=2191279) 'dtype': 'bfloat16', (TaskRunner pid=2191279) 'enable_chunked_prefill': False, (TaskRunner pid=2191279) 'enforce_eager': True, (TaskRunner pid=2191279) 'engine_kwargs': {'swap_space': 32}, (TaskRunner pid=2191279) 'free_cache_engine': True, (TaskRunner pid=2191279) 'gpu_memory_utilization': 0.75, (TaskRunner pid=2191279) 'ignore_eos': False, (TaskRunner pid=2191279) 'layer_name_map': {'gate_proj_layer_name': 'gate_up', (TaskRunner pid=2191279) 'qkv_layer_name': 'qkv'}, (TaskRunner pid=2191279) 'load_format': 'dummy_megatron', (TaskRunner pid=2191279) 'log_prob_micro_batch_size': None, (TaskRunner pid=2191279) 'log_prob_micro_batch_size_per_gpu': 1, (TaskRunner pid=2191279) 'max_model_len': 32768, (TaskRunner pid=2191279) 'max_num_batched_tokens': 4096, (TaskRunner pid=2191279) 'max_num_seqs': 1, (TaskRunner pid=2191279) 'mode': 'sync', (TaskRunner pid=2191279) 'multi_turn': {'enable': False, (TaskRunner pid=2191279) 'format': 'chatml', (TaskRunner pid=2191279) 'max_turns': None, (TaskRunner pid=2191279) 'tool_config_path': None}, (TaskRunner pid=2191279) 'n': 8, (TaskRunner pid=2191279) 'name': 'vllm', (TaskRunner pid=2191279) 'prompt_length': 32768, (TaskRunner pid=2191279) 'response_length': 4096, (TaskRunner pid=2191279) 'temperature': 1.0, (TaskRunner pid=2191279) 'tensor_model_parallel_size': 8, (TaskRunner pid=2191279) 'top_k': -1, (TaskRunner pid=2191279) 'top_p': 1, (TaskRunner pid=2191279) 'val_kwargs': {'do_sample': False, (TaskRunner pid=2191279) 'n': 1, (TaskRunner pid=2191279) 'temperature': 0, (TaskRunner pid=2191279) 'top_k': -1, (TaskRunner pid=2191279) 'top_p': 1.0}}}, (TaskRunner pid=2191279) 'algorithm': {'adv_estimator': 'grpo', (TaskRunner pid=2191279) 'gamma': 1.0, (TaskRunner pid=2191279) 'kl_ctrl': {'horizon': 10000, (TaskRunner pid=2191279) 'kl_coef': 0.001, (TaskRunner pid=2191279) 'target_kl': 0.1, (TaskRunner pid=2191279) 'type': 'fixed'}, (TaskRunner pid=2191279) 'kl_penalty': 'kl', (TaskRunner pid=2191279) 'lam': 1.0, (TaskRunner pid=2191279) 'norm_adv_by_std_in_grpo': True, (TaskRunner pid=2191279) 'use_kl_in_reward': True}, (TaskRunner pid=2191279) 'critic': {'checkpoint': {'contents': ['model', 'optimizer', 'extra']}, (TaskRunner pid=2191279) 'cliprange_value': 0.5, (TaskRunner pid=2191279) 'data_loader_seed': None, (TaskRunner pid=2191279) 'kl_ctrl': {'kl_coef': 0.001, 'type': 'fixed'}, (TaskRunner pid=2191279) 'load_weight': True, (TaskRunner pid=2191279) 'megatron': {'context_parallel_size': 1, (TaskRunner pid=2191279) 'dist_checkpointing_path': None, (TaskRunner pid=2191279) 'expert_model_parallel_size': 1, (TaskRunner pid=2191279) 'expert_tensor_parallel_size': 'None', (TaskRunner pid=2191279) 'grad_offload': False, (TaskRunner pid=2191279) 'optimizer_offload': False, (TaskRunner pid=2191279) 'param_offload': False, (TaskRunner pid=2191279) 'pipeline_model_parallel_size': 1, (TaskRunner pid=2191279) 'seed': 1, (TaskRunner pid=2191279) 'sequence_parallel': True, (TaskRunner pid=2191279) 'tensor_model_parallel_size': 1, (TaskRunner pid=2191279) 'use_dist_checkpointing': False, (TaskRunner pid=2191279) 'use_distributed_optimizer': True, (TaskRunner pid=2191279) 'virtual_pipeline_model_parallel_size': None}, (TaskRunner pid=2191279) 'model': {'enable_gradient_checkpointing': False, (TaskRunner pid=2191279) 'external_lib': None, (TaskRunner pid=2191279) 'gradient_checkpointing_kwargs': {'activations_checkpoint_granularity': None, (TaskRunner pid=2191279) 'activations_checkpoint_method': None, (TaskRunner pid=2191279) 'activations_checkpoint_num_layers': None}, (TaskRunner pid=2191279) 'override_config': {}, (TaskRunner pid=2191279) 'path': 'Qwen/Qwen3-4B', (TaskRunner pid=2191279) 'tokenizer_path': '/data/base_models/Qwen3-235B-A22B'}, (TaskRunner pid=2191279) 'optim': {'clip_grad': 1.0, (TaskRunner pid=2191279) 'lr': 1e-05, (TaskRunner pid=2191279) 'lr_warmup_steps_ratio': 0.0, (TaskRunner pid=2191279) 'min_lr_ratio': None, (TaskRunner pid=2191279) 'total_training_steps': -1, (TaskRunner pid=2191279) 'warmup_style': 'constant', (TaskRunner pid=2191279) 'weight_decay': 0.01}, (TaskRunner pid=2191279) 'ppo_epochs': 1, (TaskRunner pid=2191279) 'ppo_micro_batch_size': None, (TaskRunner pid=2191279) 'ppo_micro_batch_size_per_gpu': None, (TaskRunner pid=2191279) 'ppo_mini_batch_size': 128, (TaskRunner pid=2191279) 'rollout_n': 8, (TaskRunner pid=2191279) 'shuffle': False, (TaskRunner pid=2191279) 'strategy': 'megatron', (TaskRunner pid=2191279) 'use_dynamic_bsz': False}, (TaskRunner pid=2191279) 'custom_reward_function': {'name': 'compute_score', 'path': None}, (TaskRunner pid=2191279) 'data': {'custom_cls': {'name': None, 'path': None}, (TaskRunner pid=2191279) 'filter_overlong_prompts': True, (TaskRunner pid=2191279) 'filter_overlong_prompts_workers': 1, (TaskRunner pid=2191279) 'max_prompt_length': 32768, (TaskRunner pid=2191279) 'max_response_length': 4096, (TaskRunner pid=2191279) 'prompt_key': 'prompt', (TaskRunner pid=2191279) 'return_raw_chat': False, (TaskRunner pid=2191279) 'return_raw_input_ids': False, (TaskRunner pid=2191279) 'reward_fn_key': 'data_source', (TaskRunner pid=2191279) 'shuffle': True, (TaskRunner pid=2191279) 'tokenizer': None, (TaskRunner pid=2191279) 'train_batch_size': 128, (TaskRunner pid=2191279) 'train_files': ['/DATA_PATH//data/gsm8k/train.parquet'], (TaskRunner pid=2191279) 'truncation': 'error', (TaskRunner pid=2191279) 'val_batch_size': None, (TaskRunner pid=2191279) 'val_files': ['/DATA_PATH//data/gsm8k/test.parquet']}, (TaskRunner pid=2191279) 'ray_init': {'num_cpus': None}, (TaskRunner pid=2191279) 'reward_model': {'enable': False, (TaskRunner pid=2191279) 'launch_reward_fn_async': False, (TaskRunner pid=2191279) 'load_weight': True, (TaskRunner pid=2191279) 'max_length': None, (TaskRunner pid=2191279) 'megatron': {'context_parallel_size': 1, (TaskRunner pid=2191279) 'dist_checkpointing_path': None, (TaskRunner pid=2191279) 'expert_model_parallel_size': 1, (TaskRunner pid=2191279) 'expert_tensor_parallel_size': 'None', (TaskRunner pid=2191279) 'grad_offload': False, (TaskRunner pid=2191279) 'optimizer_offload': False, (TaskRunner pid=2191279) 'param_offload': False, (TaskRunner pid=2191279) 'pipeline_model_parallel_size': 1, (TaskRunner pid=2191279) 'seed': 1, (TaskRunner pid=2191279) 'sequence_parallel': True, (TaskRunner pid=2191279) 'tensor_model_parallel_size': 1, (TaskRunner pid=2191279) 'use_dist_checkpointing': False, (TaskRunner pid=2191279) 'use_distributed_optimizer': False, (TaskRunner pid=2191279) 'virtual_pipeline_model_parallel_size': None}, (TaskRunner pid=2191279) 'micro_batch_size': None, (TaskRunner pid=2191279) 'micro_batch_size_per_gpu': None, (TaskRunner pid=2191279) 'model': {'external_lib': None, (TaskRunner pid=2191279) 'input_tokenizer': '/data/base_models/Qwen3-235B-A22B', (TaskRunner pid=2191279) 'path': '~/models/FsfairX-LLaMA3-RM-v0.1'}, (TaskRunner pid=2191279) 'sandbox_fusion': {'max_concurrent': 64, 'url': None}, (TaskRunner pid=2191279) 'strategy': 'megatron', (TaskRunner pid=2191279) 'use_dynamic_bsz': False}, (TaskRunner pid=2191279) 'trainer': {'balance_batch': True, (TaskRunner pid=2191279) 'critic_warmup': 0, (TaskRunner pid=2191279) 'default_hdfs_dir': None, (TaskRunner pid=2191279) 'default_local_dir': '/DATA_PATH//models/verl_grpo_megatron_gsm8k/qwen3_235b22b_moe_mcore_tp_etp_not_same/v20250523_074035', (TaskRunner pid=2191279) 'del_local_ckpt_after_load': False, (TaskRunner pid=2191279) 'experiment_name': 'qwen3_235b22b_moe_mcore_tp_etp_not_same', (TaskRunner pid=2191279) 'log_val_generations': 0, (TaskRunner pid=2191279) 'logger': ['console', 'tensorboard'], (TaskRunner pid=2191279) 'max_actor_ckpt_to_keep': 1, (TaskRunner pid=2191279) 'max_critic_ckpt_to_keep': 1, (TaskRunner pid=2191279) 'n_gpus_per_node': 8, (TaskRunner pid=2191279) 'nnodes': 16, (TaskRunner pid=2191279) 'project_name': 'verl_grpo_megatron_gsm8k', (TaskRunner pid=2191279) 'ray_wait_register_center_timeout': 300, (TaskRunner pid=2191279) 'resume_from_path': None, (TaskRunner pid=2191279) 'resume_mode': 'auto', (TaskRunner pid=2191279) 'save_freq': 10, (TaskRunner pid=2191279) 'test_freq': 5, (TaskRunner pid=2191279) 'total_epochs': 5, (TaskRunner pid=2191279) 'total_training_steps': None, (TaskRunner pid=2191279) 'val_before_train': True}} (TaskRunner pid=2191279) Using dataset class: RLHFDataset (TaskRunner pid=2191279) dataset len: 7473 (TaskRunner pid=2191279) filter dataset len: 7473 (TaskRunner pid=2191279) Using dataset class: RLHFDataset (TaskRunner pid=2191279) dataset len: 1319 (TaskRunner pid=2191279) filter dataset len: 1319 (TaskRunner pid=2191279) NOTICE: You have both enabled in-reward kl and kl loss. (TaskRunner pid=2191279) [validate_config] All configuration checks passed successfully! (TaskRunner pid=2191279) Size of train dataloader: 58, Size of val dataloader: 1 (TaskRunner pid=2191279) Total training steps: 290 (TaskRunner pid=2191279) colocated worker base class (WorkerDict pid=1691639, ip=[PRIVATE_IP]) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} (WorkerDict pid=1691639, ip=[PRIVATE_IP]) TF config: TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) (WorkerDict pid=1691639, ip=[PRIVATE_IP]) self.config.ref.load_weight: True (WorkerDict pid=664049, ip=[PRIVATE_IP]) Model config after override: Qwen3MoeConfig { (WorkerDict pid=664049, ip=[PRIVATE_IP]) "architectures": [ (WorkerDict pid=664049, ip=[PRIVATE_IP]) "Qwen3MoeForCausalLM" (WorkerDict pid=664049, ip=[PRIVATE_IP]) ], (WorkerDict pid=664049, ip=[PRIVATE_IP]) "attention_bias": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "attention_dropout": 0.0, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "decoder_sparse_step": 1, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "eos_token_id": 151645, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "head_dim": 128, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "hidden_act": "silu", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "hidden_size": 4096, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "initializer_range": 0.02, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "intermediate_size": 12288, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "max_position_embeddings": 40960, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "max_window_layers": 94, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "mlp_only_layers": [], (WorkerDict pid=664049, ip=[PRIVATE_IP]) "model_type": "qwen3_moe", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "moe_intermediate_size": 1536, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "norm_topk_prob": true, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_attention_heads": 64, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_experts": 128, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_experts_per_tok": 8, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_hidden_layers": 94, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_key_value_heads": 4, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "output_router_logits": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "pad_token_id": 151643, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rms_norm_eps": 1e-06, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rope_scaling": null, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rope_theta": 1000000.0, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "router_aux_loss_coef": 0.001, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "sliding_window": null, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "tie_word_embeddings": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "torch_dtype": "bfloat16", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "transformers_version": "4.51.3", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "use_cache": true, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "use_sliding_window": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "vocab_size": 151936 (WorkerDict pid=664049, ip=[PRIVATE_IP]) } (WorkerDict pid=664049, ip=[PRIVATE_IP]) (WorkerDict pid=1691639, ip=[PRIVATE_IP]) load ref weight start (WorkerDict pid=664049, ip=[PRIVATE_IP]) > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 5560209152 (WorkerDict pid=664049, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 (WorkerDict pid=1691638, ip=[PRIVATE_IP]) TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) (WorkerDict pid=664049, ip=[PRIVATE_IP]) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} [repeated 127x across cluster] (WorkerDict pid=664049, ip=[PRIVATE_IP]) TF config: TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) [repeated 127x across cluster] (WorkerDict pid=664049, ip=[PRIVATE_IP]) self.config.ref.load_weight: True [repeated 127x across cluster] (WorkerDict pid=2194640) load ref weight start [repeated 127x across cluster] (WorkerDict pid=664239, ip=[PRIVATE_IP]) > number of parameters on (tensor, pipeline) model parallel rank (1, 0): 5560209152 [repeated 3x across cluster] (WorkerDict pid=1691642, ip=[PRIVATE_IP]) TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) [repeated 127x across cluster] (WorkerDict pid=1898926, ip=[PRIVATE_IP]) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} (WorkerDict pid=1898926, ip=[PRIVATE_IP]) TF config: TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) (WorkerDict pid=1898925, ip=[PRIVATE_IP]) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} (WorkerDict pid=1898925, ip=[PRIVATE_IP]) TF config: TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) (WorkerDict pid=664049, ip=[PRIVATE_IP]) Model config after override: Qwen3MoeConfig { (WorkerDict pid=664049, ip=[PRIVATE_IP]) "architectures": [ (WorkerDict pid=664049, ip=[PRIVATE_IP]) "Qwen3MoeForCausalLM" (WorkerDict pid=664049, ip=[PRIVATE_IP]) ], (WorkerDict pid=664049, ip=[PRIVATE_IP]) "attention_bias": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "attention_dropout": 0.0, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "decoder_sparse_step": 1, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "eos_token_id": 151645, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "head_dim": 128, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "hidden_act": "silu", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "hidden_size": 4096, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "initializer_range": 0.02, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "intermediate_size": 12288, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "max_position_embeddings": 40960, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "max_window_layers": 94, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "mlp_only_layers": [], (WorkerDict pid=664049, ip=[PRIVATE_IP]) "model_type": "qwen3_moe", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "moe_intermediate_size": 1536, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "norm_topk_prob": true, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_attention_heads": 64, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_experts": 128, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_experts_per_tok": 8, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_hidden_layers": 94, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "num_key_value_heads": 4, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "output_router_logits": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "pad_token_id": 151643, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rms_norm_eps": 1e-06, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rope_scaling": null, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "rope_theta": 1000000.0, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "router_aux_loss_coef": 0.001, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "sliding_window": null, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "tie_word_embeddings": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "torch_dtype": "bfloat16", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "transformers_version": "4.51.3", (WorkerDict pid=664049, ip=[PRIVATE_IP]) "use_cache": true, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "use_sliding_window": false, (WorkerDict pid=664049, ip=[PRIVATE_IP]) "vocab_size": 151936 (WorkerDict pid=664049, ip=[PRIVATE_IP]) } (WorkerDict pid=664049, ip=[PRIVATE_IP]) (WorkerDict pid=664049, ip=[PRIVATE_IP]) > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 5560209152 (WorkerDict pid=1639633, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 (WorkerDict pid=2194634) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} [repeated 126x across cluster] (WorkerDict pid=2194634) TF config: TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) [repeated 126x across cluster] (WorkerDict pid=1821700, ip=[PRIVATE_IP]) actor_module: 1 (WorkerDict pid=1639635, ip=[PRIVATE_IP]) > number of parameters on (tensor, pipeline) model parallel rank (1, 1): 5560213248 [repeated 3x across cluster] (WorkerDict pid=1639640, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 [repeated 17x across cluster] (WorkerDict pid=664049, ip=[PRIVATE_IP]) DistributedDataParallel contains 5.56B parameters (WorkerDict pid=1889683, ip=[PRIVATE_IP]) actor_module: 1 [repeated 127x across cluster] (WorkerDict pid=664241, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 [repeated 45x across cluster] (WorkerDict pid=1898926, ip=[PRIVATE_IP]) TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) (WorkerDict pid=664049, ip=[PRIVATE_IP]) Before building vllm rollout, memory allocated (GB): 19.87, memory reserved (GB): 19.88, device memory used/total (GB): 26.01/139.81 (WorkerDict pid=1898926, ip=[PRIVATE_IP]) WARNING 05-23 07:49:59 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used (WorkerDict pid=2194634) TransformerConfig(tensor_model_parallel_size=2, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=None, sequence_parallel=True, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=32, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=True, params_dtype=torch.bfloat16, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.bfloat16, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=torch.bfloat16, variable_seq_lengths=True, overlap_p2p_comm=False, batch_p2p_comm=False, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=2, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=True, cpu_offloading_weights=True, barrier_with_L1_time=True, num_layers=94, mtp_num_layers=None, mtp_loss_scaling_factor=None, num_layers_in_first_pipeline_stage=None, num_layers_in_last_pipeline_stage=None, account_for_embedding_in_pipeline_split=False, account_for_loss_in_pipeline_split=False, hidden_size=4096, num_attention_heads=64, attention_backend=, softmax_scale=None, num_query_groups=4, ffn_hidden_size=12288, kv_channels=128, hidden_dropout=0.0, attention_dropout=0.0, fp32_residual_connection=False, apply_residual_connection_post_layernorm=False, layernorm_epsilon=1e-06, layernorm_zero_centered_gamma=False, add_bias_linear=False, add_qkv_bias=False, gated_linear_unit=True, activation_func=, activation_func_fp8_input_store=False, num_moe_experts=128, rotary_interleaved=False, window_size=None, normalization='RMSNorm', qk_layernorm=True, test_mode=False, calculate_per_token_loss=False, multi_latent_attention=False, init_method=functools.partial(, mean=0.0, std=0.02), output_layer_init_method=functools.partial(, mean=0.0, std=0.0014586499149789457), init_method_std=0.02, init_model_with_meta_device=False, apply_query_key_layer_scaling=False, attention_softmax_in_fp32=True, disable_bf16_reduced_precision_matmul=False, bias_activation_fusion=True, masked_softmax_fusion=True, persist_layer_norm=True, memory_efficient_layer_norm=False, bias_dropout_fusion=True, apply_rope_fusion=False, recompute_granularity=None, recompute_method=None, recompute_num_layers=None, distribute_saved_activations=None, fp8=None, fp8_recipe='delayed', fp8_param=False, fp8_margin=0, fp8_interval=1, fp8_amax_history_len=1, fp8_amax_compute_algo='most_recent', fp8_wgrad=True, fp8_dot_product_attention=False, fp8_multi_head_attention=False, tp_only_amax_red=False, first_last_layers_bf16=False, num_layers_at_start_in_bf16=1, num_layers_at_end_in_bf16=1, moe_shared_expert_intermediate_size=None, moe_shared_expert_overlap=False, moe_layer_freq=1, moe_ffn_hidden_size=1536, moe_router_load_balancing_type='aux_loss', moe_router_topk=8, moe_router_topk_limited_devices=None, moe_router_num_groups=None, moe_router_group_topk=None, moe_router_pre_softmax=False, moe_router_topk_scaling_factor=None, moe_router_score_function='softmax', moe_router_dtype=None, moe_router_enable_expert_bias=False, moe_router_bias_update_rate=0.001, moe_grouped_gemm=True, moe_use_legacy_grouped_gemm=False, moe_aux_loss_coeff=0.001, moe_z_loss_coeff=None, moe_input_jitter_eps=None, moe_token_dropping=False, moe_token_dispatcher_type='alltoall', moe_enable_deepep=False, moe_per_layer_logging=False, moe_expert_capacity_factor=None, moe_pad_expert_input_to_capacity=False, moe_token_drop_policy='probs', moe_layer_recompute=False, moe_permute_fusion=False, cp_comm_type=None, enable_cuda_graph=False, cuda_graph_use_single_mempool=False, cuda_graph_retain_backward_graph=False, cuda_graph_warmup_steps=3, external_cuda_graph=False, cuda_graph_scope='full', clone_scatter_output_in_embedding=True, disable_parameter_transpose_cache=False, config_logger_dir='', flash_decode=False, inference_rng_tracker=False, mrope_section=None, use_custom_fsdp=False, is_hybrid_model=False, mamba_state_dim=128, mamba_head_dim=64, mamba_num_groups=8) [repeated 127x across cluster] (WorkerDict pid=1898926, ip=[PRIVATE_IP]) WARNING 05-23 07:49:59 [utils.py:2522] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in (WorkerDict pid=1616556, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 (WorkerDict pid=2194634) WARNING 05-23 07:50:02 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used [repeated 127x across cluster] (WorkerDict pid=1898929, ip=[PRIVATE_IP]) kwargs: {'n': 8, 'logprobs': 0, 'max_tokens': 4096, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} (WorkerDict pid=1898929, ip=[PRIVATE_IP]) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} (WorkerDict pid=2194634) WARNING 05-23 07:50:03 [utils.py:2522] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in  [repeated 127x across cluster] (WorkerDict pid=1821694, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 [repeated 7x across cluster] (WorkerDict pid=2194639) kwargs: {'n': 8, 'logprobs': 0, 'max_tokens': 4096, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 120x across cluster] (WorkerDict pid=2194639) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} [repeated 120x across cluster] (TaskRunner pid=2191279) Saving tensorboard log to /DATA_PATH//models/verl_grpo_megatron_gsm8k/qwen3_235b22b_moe_mcore_tp_etp_not_same/v20250523_074035/logs. (TaskRunner pid=2191279) Using LocalLogger is deprecated. The constructor API will change (TaskRunner pid=2191279) Checkpoint tracker file does not exist: %s /DATA_PATH//models/verl_grpo_megatron_gsm8k/qwen3_235b22b_moe_mcore_tp_etp_not_same/v20250523_074035/latest_checkpointed_iteration.txt (TaskRunner pid=2191279) Training from scratch (TaskRunner pid=2191279) test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True} (WorkerDict pid=2194640) kwargs: {'n': 8, 'logprobs': 0, 'max_tokens': 4096, 'detokenize': False, 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'ignore_eos': False} [repeated 7x across cluster] (WorkerDict pid=2194640) Overridden TF init config: {'num_layers': 94, 'hidden_size': 4096, 'num_attention_heads': 64, 'num_query_groups': 4, 'ffn_hidden_size': 12288, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'kv_channels': 128, 'layernorm_epsilon': 1e-06, 'activation_func': , 'normalization': 'RMSNorm', 'gated_linear_unit': True, 'pipeline_dtype': torch.bfloat16, 'params_dtype': torch.bfloat16, 'bf16': True, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2, 'expert_model_parallel_size': 32, 'expert_tensor_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'overlap_p2p_comm': False, 'batch_p2p_comm': False, 'sequence_parallel': True, 'variable_seq_lengths': True, 'masked_softmax_fusion': True, 'moe_token_dispatcher_type': 'alltoall', 'use_cpu_initialization': False, 'add_bias_linear': False, 'moe_ffn_hidden_size': 1536, 'moe_router_bias_update_rate': 0.001, 'moe_router_topk': 8, 'num_moe_experts': 128, 'moe_aux_loss_coeff': 0.001, 'moe_router_load_balancing_type': 'aux_loss', 'moe_grouped_gemm': True, 'moe_router_score_function': 'softmax', 'persist_layer_norm': True, 'bias_activation_fusion': True, 'bias_dropout_fusion': True, 'moe_router_pre_softmax': False, 'qk_layernorm': True} [repeated 7x across cluster] (WorkerDict pid=2194639) NCCL version 2.21.5+cuda12.4 (TaskRunner pid=2191279) validation generation end (WorkerDict pid=1806693, ip=[PRIVATE_IP]) NCCL version 2.21.5+cuda12.4 [repeated 39x across cluster] (TaskRunner pid=2191279) [prompt] user (TaskRunner pid=2191279) Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Let's think step by step and output the final answer after "####". (TaskRunner pid=2191279) assistant (TaskRunner pid=2191279) (TaskRunner pid=2191279) [response] (TaskRunner pid=2191279) Okay, let's see. Janet has ducks that lay 16 eggs each day. She uses some of those eggs for herself and her friends, and then sells the rest. The problem is asking how much money she makes every day from selling the eggs at the farmers' market. Let me break this down step by step. (TaskRunner pid=2191279) (TaskRunner pid=2191279) First, the total eggs laid per day are 16. Then she eats three for breakfast. So subtract those three from the total. Then she bakes muffins with four eggs. So subtract another four. The remaining eggs are the ones she sells. Each of those sells for $2. So I need to calculate the remaining eggs after her personal use and then multiply by 2 to get the total dollars. (TaskRunner pid=2191279) (TaskRunner pid=2191279) Let me write this out: (TaskRunner pid=2191279) (TaskRunner pid=2191279) Total eggs per day = 16 (TaskRunner pid=2191279) (TaskRunner pid=2191279) Eggs eaten for breakfast = 3 (TaskRunner pid=2191279) (TaskRunner pid=2191279) Eggs used for baking muffins = 4 (TaskRunner pid=2191279) (TaskRunner pid=2191279) So the eggs left for selling would be 16 minus 3 minus 4. Let me compute that. 16 - 3 is 13, then 13 -4 is 9. Wait, 16 minus 3 minus 4 equals 9? Hmm, 3+4 is 7, so 16-7 is 9. Yes, that's right. So she sells 9 eggs each day. (TaskRunner pid=2191279) (TaskRunner pid=2191279) Then, each egg sells for $2. So total money made is 9 * 2 = $18. So the answer should be 18 dollars. (TaskRunner pid=2191279) (TaskRunner pid=2191279) Wait, let me double-check. Maybe I made a mistake in subtraction. Let's see: 16 eggs total. She uses 3 +4 =7 eggs. 16-7 is indeed 9. Then 9 times $2 each. 9*2 is 18. That seems correct. So she makes $18 each day at the farmers' market. (TaskRunner pid=2191279) (TaskRunner pid=2191279) I don't see any other steps or factors mentioned in the problem. The problem says "the remainder" so that's all the eggs left after her personal use. And the price is per egg, $2 each. So yes, 9 times two. Yep, that's 18. I think that's the answer. (TaskRunner pid=2191279) (TaskRunner pid=2191279) (TaskRunner pid=2191279) Janet's ducks lay a total of 16 eggs per day. She uses 3 eggs for breakfast and 4 eggs for baking muffins. The total eggs used for personal purposes are: (TaskRunner pid=2191279) (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) 3 + 4 = 7 \text{ eggs} (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) (TaskRunner pid=2191279) Subtracting this from the total eggs laid: (TaskRunner pid=2191279) (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) 16 - 7 = 9 \text{ eggs remaining} (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) (TaskRunner pid=2191279) She sells these 9 eggs at the farmers' market for $2 per egg. Her daily earnings are: (TaskRunner pid=2191279) (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) 9 \times 2 = 18 \text{ dollars} (TaskRunner pid=2191279) $$ (TaskRunner pid=2191279) (TaskRunner pid=2191279) #### 18 (TaskRunner pid=2191279) [ground_truth] 18 (TaskRunner pid=2191279) [score] 1.0 (TaskRunner pid=2191279) ("Initial validation metrics: {'val-core/openai/gsm8k/reward/mean@1': " (TaskRunner pid=2191279) '0.5231235784685367}') (TaskRunner pid=2191279) step:0 - val-core/openai/gsm8k/reward/mean@1:0.523 (TaskRunner pid=2191279) list(reward_extra_infos_dict.keys())=[] (TaskRunner pid=2191279) step:1 - global_seqlen/min:2867.000 - global_seqlen/max:32364.000 - global_seqlen/minmax_diff:29497.000 - global_seqlen/balanced_min:11095.000 - global_seqlen/balanced_max:11146.000 - global_seqlen/mean:11123.141 - actor/entropy_loss:11.931 - actor/reward_kl_penalty:-0.000 - actor/reward_kl_penalty_coeff:0.001 - actor/kl_loss:0.010 - actor/kl_coef:0.001 - actor/pg_loss:-0.004 - actor/pg_clipfrac:0.037 - actor/ppo_kl:0.001 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.587 - actor/lr:0.000 - perf/mfu/actor:0.219 - training/global_step:1.000 - training/epoch:0.000 - critic/score/mean:0.474 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.474 - critic/rewards/max:1.042 - critic/rewards/min:-0.023 - critic/advantages/mean:-0.038 - critic/advantages/max:2.475 - critic/advantages/min:-2.475 - critic/returns/mean:-0.038 - critic/returns/max:2.475 - critic/returns/min:-2.475 - response_length/mean:1306.096 - response_length/max:4096.000 - response_length/min:228.000 - response_length/clip_ratio:0.041 - prompt_length/mean:84.297 - prompt_length/max:149.000 - prompt_length/min:45.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:425.884 - timing_s/reward:0.514 - timing_s/old_log_prob:55.894 - timing_s/ref:27.195 - timing_s/adv:0.073 - timing_s/update_actor:71.430 - timing_s/step:581.591 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.050 - timing_per_token_ms/gen:0.318 - timing_per_token_ms/ref:0.019 - perf/total_num_tokens:1423762.000 - perf/time_per_step:581.591 - perf/throughput:19.125 (TaskRunner pid=2191279) list(reward_extra_infos_dict.keys())=[] (TaskRunner pid=2191279) step:2 - global_seqlen/min:2564.000 - global_seqlen/max:33480.000 - global_seqlen/minmax_diff:30916.000 - global_seqlen/balanced_min:10424.000 - global_seqlen/balanced_max:10733.000 - global_seqlen/mean:10587.719 - actor/entropy_loss:11.931 - actor/reward_kl_penalty:-0.000 - actor/reward_kl_penalty_coeff:0.001 - actor/kl_loss:0.009 - actor/kl_coef:0.001 - actor/pg_loss:0.122 - actor/pg_clipfrac:0.033 - actor/ppo_kl:0.000 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.576 - actor/lr:0.000 - perf/mfu/actor:0.210 - training/global_step:2.000 - training/epoch:0.000 - critic/score/mean:0.548 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.548 - critic/rewards/max:1.021 - critic/rewards/min:-0.027 - critic/advantages/mean:-0.059 - critic/advantages/max:2.475 - critic/advantages/min:-2.475 - critic/returns/mean:-0.059 - critic/returns/max:2.475 - critic/returns/min:-2.475 - response_length/mean:1240.309 - response_length/max:4096.000 - response_length/min:209.000 - response_length/clip_ratio:0.044 - prompt_length/mean:83.156 - prompt_length/max:168.000 - prompt_length/min:49.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:403.466 - timing_s/reward:0.554 - timing_s/old_log_prob:30.288 - timing_s/ref:25.065 - timing_s/adv:0.085 - timing_s/update_actor:63.213 - timing_s/step:523.197 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.047 - timing_per_token_ms/gen:0.318 - timing_per_token_ms/ref:0.018 - perf/total_num_tokens:1355228.000 - perf/time_per_step:523.197 - perf/throughput:20.237 (TaskRunner pid=2191279) list(reward_extra_infos_dict.keys())=[] (TaskRunner pid=2191279) step:3 - global_seqlen/min:3168.000 - global_seqlen/max:33400.000 - global_seqlen/minmax_diff:30232.000 - global_seqlen/balanced_min:11320.000 - global_seqlen/balanced_max:11331.000 - global_seqlen/mean:11326.820 - actor/entropy_loss:11.931 - actor/reward_kl_penalty:0.000 - actor/reward_kl_penalty_coeff:0.001 - actor/kl_loss:0.010 - actor/kl_coef:0.001 - actor/pg_loss:-0.265 - actor/pg_clipfrac:0.039 - actor/ppo_kl:-0.001 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.584 - actor/lr:0.000 - perf/mfu/actor:0.246 - training/global_step:3.000 - training/epoch:0.000 - critic/score/mean:0.548 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.548 - critic/rewards/max:1.026 - critic/rewards/min:-0.028 - critic/advantages/mean:-0.058 - critic/advantages/max:2.475 - critic/advantages/min:-2.475 - critic/returns/mean:-0.058 - critic/returns/max:2.475 - critic/returns/min:-2.475 - response_length/mean:1332.353 - response_length/max:4096.000 - response_length/min:243.000 - response_length/clip_ratio:0.054 - prompt_length/mean:83.500 - prompt_length/max:162.000 - prompt_length/min:44.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:437.658 - timing_s/reward:0.541 - timing_s/old_log_prob:30.934 - timing_s/ref:24.752 - timing_s/adv:0.077 - timing_s/update_actor:58.610 - timing_s/step:553.063 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.040 - timing_per_token_ms/gen:0.321 - timing_per_token_ms/ref:0.017 - perf/total_num_tokens:1449833.000 - perf/time_per_step:553.063 - perf/throughput:20.480 (TaskRunner pid=2191279) list(reward_extra_infos_dict.keys())=[] (TaskRunner pid=2191279) step:4 - global_seqlen/min:2969.000 - global_seqlen/max:34320.000 - global_seqlen/minmax_diff:31351.000 - global_seqlen/balanced_min:11344.000 - global_seqlen/balanced_max:11385.000 - global_seqlen/mean:11364.852 - actor/entropy_loss:11.931 - actor/reward_kl_penalty:0.000 - actor/reward_kl_penalty_coeff:0.001 - actor/kl_loss:0.010 - actor/kl_coef:0.001 - actor/pg_loss:-0.149 - actor/pg_clipfrac:0.037 - actor/ppo_kl:0.001 - actor/pg_clipfrac_lower:0.000 - actor/grad_norm:0.596 - actor/lr:0.000 - perf/mfu/actor:0.226 - training/global_step:4.000 - training/epoch:0.000 - critic/score/mean:0.616 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.616 - critic/rewards/max:1.017 - critic/rewards/min:-0.032 - critic/advantages/mean:-0.031 - critic/advantages/max:2.475 - critic/advantages/min:-2.475 - critic/returns/mean:-0.031 - critic/returns/max:2.475 - critic/returns/min:-2.475 - response_length/mean:1335.911 - response_length/max:4096.000 - response_length/min:209.000 - response_length/clip_ratio:0.069 - prompt_length/mean:84.695 - prompt_length/max:194.000 - prompt_length/min:48.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:440.420 - timing_s/reward:0.525 - timing_s/old_log_prob:31.486 - timing_s/ref:24.538 - timing_s/adv:0.080 - timing_s/update_actor:63.980 - timing_s/step:561.594 - timing_per_token_ms/adv:0.000 - timing_per_token_ms/update_actor:0.044 - timing_per_token_ms/gen:0.322 - timing_per_token_ms/ref:0.017 - perf/total_num_tokens:1454701.000 - perf/time_per_step:561.594 - perf/throughput:20.237