diff --git a/examples/tuning/1.5b/qwen2.5math-1.5b_grpo_1_h100_fsdp_vllm.sh b/examples/tuning/1.5b/qwen2.5math-1.5b_grpo_1_h100_fsdp_vllm.sh
new file mode 100644
index 00000000000..a0bbb12e382
--- /dev/null
+++ b/examples/tuning/1.5b/qwen2.5math-1.5b_grpo_1_h100_fsdp_vllm.sh
@@ -0,0 +1,59 @@
+PROJECT_NAME=qwen2.5_Math_1.5b_dr_grpo
+LR=1e-5
+MODEL_NAME=Qwen/Qwen2.5-Math-1.5B
+MAX_NEW_TOKENS=3000
+MAX_PROMPT_LENGTH=1024 #1024 #512
+MODEL_ALIAS=${MODEL_NAME##*/}
+GROUP_SIZE=8
+CHOSEN_DEVICE=0 #0,1
+TOTAL_EPOCHS=2
+LOG_N_VAL_GENERATIONS=40
+ADVANTAGE_ESTIMATOR="grpo"
+
+
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+RUN_NAME=${MODEL_ALIAS}_${LR}_${GROUP_SIZE}_${MAX_NEW_TOKENS}_${ADVANTAGE_ESTIMATOR}_${TIMESTAMP}
+
+CUDA_VISIBLE_DEVICES=$CHOSEN_DEVICE python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=$ADVANTAGE_ESTIMATOR \
+    data.train_files=data/MATH/train.parquet \
+    data.val_files=data/MATH/test.parquet \
+    data.train_batch_size=256 \
+    data.max_prompt_length=$MAX_PROMPT_LENGTH \
+    data.max_response_length=$MAX_NEW_TOKENS \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=$MODEL_NAME \
+    actor_rollout_ref.actor.optim.lr=$LR \
+    actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum-norm \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.actor.kl_loss_coef=0.001 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+    actor_rollout_ref.rollout.n=$GROUP_SIZE \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    actor_rollout_ref.rollout.temperature=1.0 \
+    actor_rollout_ref.rollout.top_p=1.0 \
+    algorithm.use_kl_in_reward=False \
+    algorithm.norm_adv_by_std_in_grpo=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.n_gpus_per_node=1 \
+    trainer.log_val_generations=$LOG_N_VAL_GENERATIONS \
+    trainer.nnodes=1 \
+    trainer.save_freq=20 \
+    trainer.test_freq=1 \
+    trainer.project_name=$PROJECT_NAME \
+    trainer.experiment_name=$RUN_NAME \
+    trainer.total_epochs=$TOTAL_EPOCHS
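
Usage sketch (not part of the diff): a minimal way to launch the new example, assuming the MATH train/test parquet files have already been prepared at data/MATH/ and the command is run from the repository root, since the script references relative data paths.

    # Assumptions: data/MATH/{train,test}.parquet exist and device 0 is a free H100-class GPU.
    bash examples/tuning/1.5b/qwen2.5math-1.5b_grpo_1_h100_fsdp_vllm.sh

Because GPU id, learning rate, and epoch count are plain variables at the top of the script rather than environment-overridable defaults, edit those lines directly to change the run configuration.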