#!/bin/bash

#############################################
# User defined parameters and env vars

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

export NEURON_CC_FLAGS="--model-type=transformer --cache_dir=$HOME/neuron_compile_cache/"
export NEURON_FUSE_SOFTMAX=1

# Async Runtime
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3

# Limit number of NEFFs kept on device
export NEURON_NUM_RECENT_MODELS_TO_KEEP=4

# Host OOM mitigation
export MALLOC_ARENA_MAX=64

# TP degree
TP_DEGREE=8
# 0: bf16; 1: mixed precision
USE_MIX_PRECISION=0
# 0: use pure DP; 1: use ZeRO-1
USE_ZERO_1=1
# global batch size
GBS=32
# micro batch size
MBS=1
# number of steps to run
TOTAL_STEPS=100
# number of epochs to run
TOTAL_EPOCHS=5
# warmup steps
WARMUP_STEPS=20
# learning rate
LR=5.0e-5
# model path
MODEL_PATH=$SCRIPT_DIR
# base model name
BASE_MODEL="NousResearch/Llama-2-7b-hf"
# sequence length
SEQ_LEN=4096
# Path to dataset
DATA_PATH="databricks/databricks-dolly-15k"

#############################################

export NUM_NEURONCORES=32
NODE_ID=0
WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES"
METRICS_FILE="results.json"

# Default to 0 so the numeric tests below don't error when the variable is
# unset (it is exported by the graph-extraction tooling during compilation).
NEURON_EXTRACT_GRAPHS_ONLY=${NEURON_EXTRACT_GRAPHS_ONLY:-0}

if [ -n "$SLURM_NTASKS" ]; then
    # Launched under SLURM
    WORLD_SIZE=$SLURM_NTASKS
    NODE_ID=$SLURM_NODEID
    # First host in the job's nodelist serves as the rendezvous master
    MASTER_ADDRESS=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
    DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000"
    if [ $NODE_ID -eq 0 ]; then
        echo "WORLD_SIZE=$WORLD_SIZE"
        echo "NODE_ID=$NODE_ID"
        echo "MASTER_ADDRESS=$MASTER_ADDRESS"
        echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
    fi
    export FI_EFA_USE_DEVICE_RDMA=1
    export FI_PROVIDER=efa
    # Reserve the master/debug ports so the kernel won't hand them out as
    # ephemeral ports
    sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000,48620

    if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then
        STEPS_THIS_RUN=2
        OUTPUT_LOG=log_compile-$NODE_ID.log
    elif [ -v PERF_TEST ] && [ $PERF_TEST -gt 0 ]; then
        STEPS_THIS_RUN=100
        OUTPUT_LOG=log_exe-$NODE_ID.log
    else
        STEPS_THIS_RUN=-1
        OUTPUT_LOG=log_exe-$NODE_ID.log
    fi
elif [ -v OMPI_COMM_WORLD_RANK ]; then
    # Launched under MPI: increase the fd limit for the container
    ulimit -n 65535
    WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
    NODE_ID=$OMPI_COMM_WORLD_RANK
    NODELIST=$(/root/nodelist_helper.py)
    # Split the whitespace-separated node list into an array
    HOSTS=(${NODELIST//\ / })
    MASTER_ADDRESS=${HOSTS[0]}
    DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000"
    export FI_EFA_USE_DEVICE_RDMA=1
    export FI_PROVIDER=efa
    export CCOM_SOCKET_IFNAME=eth0
    export FI_EFA_FORK_SAFE=1

    # Dataset is in a shared location (don't know where the s3 bucket for
    # this artifact is to onboard to Kaizen)
    #DATA_PATH="$SHARED_PATH_PREFIX/examples_datasets/databricks-dolly-15k"

    # Store metrics in a shared location
    METRICS_FILE=$ARTIFACT_PATH/results.json
    mkdir -p $ARTIFACT_PATH

    JOB_ID=$POD_UID
    export EXPLICIT_LOGDIR=null
    LOG_PATH="$ARTIFACT_PATH/logs/$JOB_ID/$NODE_ID"
    mkdir -p $LOG_PATH

    if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then
        STEPS_THIS_RUN=2
        OUTPUT_LOG="$LOG_PATH/log_compile-$NODE_ID.log"
    elif [ -v PERF_TEST ] && [ $PERF_TEST -gt 0 ]; then
        STEPS_THIS_RUN=100
        OUTPUT_LOG="$LOG_PATH/log_exe-$NODE_ID.log"
    else
        STEPS_THIS_RUN=-1
        OUTPUT_LOG="$LOG_PATH/log_exe-$NODE_ID.log"
    fi
fi

echo "WORLD_SIZE=$WORLD_SIZE"
echo "NODE_ID=$NODE_ID"
echo "MASTER_ADDRESS=$MASTER_ADDRESS"

export NEURON_RT_NUM_CORES=32
export NUM_NEURONCORES=$NEURON_RT_NUM_CORES
export TPU_NUM_DEVICES=$NEURON_RT_NUM_CORES
export TPU_CHIPS_PER_HOST_BOUNDS=$NEURON_RT_NUM_CORES

#############################################
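# Worked example of the data-parallel / gradient-accumulation arithmetic
# computed below, using the defaults above on a single node with 32
# NeuronCores (e.g. a trn1.32xlarge — an assumption, not fixed by this
# script):
#   DP        = NEURON_RT_NUM_CORES * WORLD_SIZE / TP_DEGREE = 32 * 1 / 8 = 4
#   ACC_STEPS = GBS / MBS / DP                               = 32 / 1 / 4 = 8
# i.e. each of the 4 data-parallel replicas accumulates 8 micro batches of
# size 1 before the optimizer step, reproducing the global batch size of 32.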
--use_mix_precision" fi if [ $USE_ZERO_1 -gt 0 ]; then EXTRA_ARGS+=" --use_zero_1" fi DP=$(($NEURON_RT_NUM_CORES * $WORLD_SIZE / $TP_DEGREE)) ACC_STEPS=$(($GBS / $MBS / $DP)) # Set default values if not set by SLURM or MPI sections if [ -z "$STEPS_THIS_RUN" ]; then if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then STEPS_THIS_RUN=2 OUTPUT_LOG=log_compile-$NODE_ID.log else STEPS_THIS_RUN=-1 OUTPUT_LOG=log_exe-$NODE_ID.log EXTRA_ARGS+=" --do_eval" fi else # If STEPS_THIS_RUN was set by SLURM/MPI but we're not compiling, add eval if [ $NEURON_EXTRACT_GRAPHS_ONLY -eq 0 ]; then EXTRA_ARGS+=" --do_eval" fi fi echo TP_DEGREE=$TP_DEGREE echo USE_MIX_PRECISION=$USE_MIX_PRECISION echo USE_ZERO_1=$USE_ZERO_1 echo GBS=$GBS echo MBS=$MBS echo TOTAL_STEPS=$TOTAL_STEPS echo TOTAL_EPOCHS=$TOTAL_EPOCHS echo WARMUP_STEPS=$WARMUP_STEPS echo LR=$LR echo MODEL_PATH=$MODEL_PATH echo SEQ_LEN=$SEQ_LEN echo EXTRA_ARGS=$EXTRA_ARGS echo DP=$DP echo ACC_STEPS=$ACC_STEPS echo STEPS_THIS_RUN=$STEPS_THIS_RUN echo OUTPUT_LOG=$OUTPUT_LOG torchrun $DISTRIBUTED_ARGS \ tp_llama_hf_finetune_ptl.py \ --model_path $MODEL_PATH \ --model_name $BASE_MODEL \ --data_dir $DATA_PATH \ --task "open_qa" \ --tensor_parallel_size $TP_DEGREE \ --batch_size $MBS \ --steps_this_run $STEPS_THIS_RUN\ --max_steps $TOTAL_STEPS \ --num_train_epochs $TOTAL_EPOCHS \ --warmup_steps $WARMUP_STEPS \ --lr $LR \ --grad_accum_usteps $ACC_STEPS \ --seq_len $SEQ_LEN \ --selective_checkpoint_enabled \ --fuse_qkv 0 \ $EXTRA_ARGS |& tee $OUTPUT_LOG exit ${PIPESTATUS[0]}