训练脚本配置如下:
```bash
# Dataset configuration (Hydra overrides for verl's `data` section).
# Expansions are quoted so paths containing spaces stay a single override.
DATA=(
  data.train_files="${train_path}"
  data.val_files="${test_path}"
  data.train_batch_size=32
  data.max_prompt_length=4096
  data.max_response_length=512
  data.truncation='error'
  data.filter_overlong_prompts=True
  # '+' adds a key not present in the base config; disables chat-template "thinking" mode.
  +data.apply_chat_template_kwargs.enable_thinking=False
)
# Model configuration: HF checkpoint path, remote-code trust, padding removal.
MODEL=(
  actor_rollout_ref.model.path="${HF_MODEL_PATH}"
  actor_rollout_ref.model.trust_remote_code=True
  actor_rollout_ref.model.use_remove_padding=False
)
# Actor (policy) training configuration: PPO hyper-parameters, KL/entropy
# regularization, Megatron parallelism, offload, and checkpointing.
# All variable expansions are quoted so values with spaces stay one override.
ACTOR=(
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.freeze_vision_tower=True
  actor_rollout_ref.actor.ppo_mini_batch_size=16
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
  actor_rollout_ref.actor.use_dynamic_bsz=False
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.01
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.clip_ratio_high=0.28
  actor_rollout_ref.actor.clip_ratio_low=0.2
  actor_rollout_ref.actor.entropy_coeff=0.01
  # --- Megatron parallelism / offload ---
  actor_rollout_ref.actor.megatron.use_mbridge=True
  actor_rollout_ref.actor.megatron.vanilla_mbridge=True
  actor_rollout_ref.actor.megatron.use_remove_padding=False
  actor_rollout_ref.actor.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.actor.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.actor.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.actor.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.actor.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.actor.megatron.param_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.optimizer_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.grad_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.dtype=bfloat16
  ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
  +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
  # ===== Checkpoint memory optimizations =====
  # Save only model weights (no optimizer state) to cut memory usage sharply:
  # actor_rollout_ref.actor.checkpoint.save_contents="['model']"
  # Async save avoids blocking training and reduces peak memory pressure.
  actor_rollout_ref.actor.checkpoint.async_save=True
  # mbridge memory-efficient mode (needs a distributed filesystem, e.g. NFS/HDFS).
  +actor_rollout_ref.actor.checkpoint.mbridge_config.distributed_filesystem=True
  +actor_rollout_ref.actor.checkpoint.mbridge_config.memory_efficient=True
)
# Rollout (generation) configuration for the inference engine.
ROLLOUT=(
  actor_rollout_ref.rollout.name="${rollout_name}"
  actor_rollout_ref.rollout.tensor_model_parallel_size="${GEN_TP}"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5
  actor_rollout_ref.rollout.n=8
  actor_rollout_ref.rollout.mode=async
  actor_rollout_ref.rollout.dtype=bfloat16
  # Qwen3.5 GDN linear attention is incompatible with vLLM CUDA graphs.
  actor_rollout_ref.rollout.enforce_eager=True
  # ===== log_prob recomputation settings =====
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.rollout.max_model_len=4608
)
# Reference-policy configuration (frozen model used for the KL penalty).
REF=(
  # ===== ref log_prob computation settings =====
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.ref.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.ref.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.ref.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.ref.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.ref.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.ref.megatron.param_offload="${ALL_OFFLOAD}"
)
# Advantage estimator / reward-side KL settings.
ALGORITHM=(
  algorithm.adv_estimator="${adv_estimator}"
  algorithm.use_kl_in_reward=False
)
# Trainer topology, logging, and checkpoint cadence.
TRAINER=(
  trainer.critic_warmup=0
  trainer.logger='["console","tensorboard"]'
  trainer.project_name="${project_name}"
  trainer.experiment_name="${exp_name}"
  trainer.n_gpus_per_node=8
  trainer.nnodes=1
  trainer.save_freq=10
  trainer.max_actor_ckpt_to_keep=2
  trainer.val_before_train=False
  trainer.test_freq=-1
  trainer.total_epochs=2
  trainer.default_local_dir="${SAVE_DIR}"
  +trainer.tensorboard_dir="${SAVE_DIR}/logs/tensorboard"
)
# Custom reward-function hook (path to the Python file + function name).
REWARD=(
  custom_reward_function.path="${CUSTOM_REWARD}"
  custom_reward_function.name="my_reward_fn"
)
########################### Launch ###########################
# BUG FIX: the original command had no `\` line continuations, so every
# `--flag` / `"${ARRAY[@]}"` line executed as its own (failing) shell command
# and the trainer was launched with no overrides at all.
: "${SAVE_DIR:?SAVE_DIR must be set}"
mkdir -p -- "${SAVE_DIR}"
python3 -m verl.trainer.main_ppo \
  --config-path=config \
  --config-name='ppo_megatron_trainer.yaml' \
  "${DATA[@]}" \
  "${ALGORITHM[@]}" \
  "${MODEL[@]}" \
  "${ROLLOUT[@]}" \
  "${ACTOR[@]}" \
  "${REF[@]}" \
  "${TRAINER[@]}" \
  "${REWARD[@]}" \
  "$@" 2>&1 | tee "${SAVE_DIR}/training.log"
```
训练曲线:
添加了熵正则也不起作用,想问问有没有成功训练Qwen3.5经验的大佬,有没有遇到过这个问题。
训练脚本配置如下:
```bash
# Dataset configuration (Hydra overrides for verl's `data` section).
# Expansions are quoted so paths containing spaces stay a single override.
DATA=(
  data.train_files="${train_path}"
  data.val_files="${test_path}"
  data.train_batch_size=32
  data.max_prompt_length=4096
  data.max_response_length=512
  data.truncation='error'
  data.filter_overlong_prompts=True
  # '+' adds a key not present in the base config; disables chat-template "thinking" mode.
  +data.apply_chat_template_kwargs.enable_thinking=False
)
# Model configuration: HF checkpoint path, remote-code trust, padding removal.
MODEL=(
  actor_rollout_ref.model.path="${HF_MODEL_PATH}"
  actor_rollout_ref.model.trust_remote_code=True
  actor_rollout_ref.model.use_remove_padding=False
)
# Actor (policy) training configuration: PPO hyper-parameters, KL/entropy
# regularization, Megatron parallelism, offload, and checkpointing.
# All variable expansions are quoted so values with spaces stay one override.
ACTOR=(
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.freeze_vision_tower=True
  actor_rollout_ref.actor.ppo_mini_batch_size=16
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
  actor_rollout_ref.actor.use_dynamic_bsz=False
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.01
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.clip_ratio_high=0.28
  actor_rollout_ref.actor.clip_ratio_low=0.2
  actor_rollout_ref.actor.entropy_coeff=0.01
  # --- Megatron parallelism / offload ---
  actor_rollout_ref.actor.megatron.use_mbridge=True
  actor_rollout_ref.actor.megatron.vanilla_mbridge=True
  actor_rollout_ref.actor.megatron.use_remove_padding=False
  actor_rollout_ref.actor.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.actor.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.actor.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.actor.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.actor.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.actor.megatron.param_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.optimizer_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.grad_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.dtype=bfloat16
  ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
  +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
  # ===== Checkpoint memory optimizations =====
  # Save only model weights (no optimizer state) to cut memory usage sharply:
  # actor_rollout_ref.actor.checkpoint.save_contents="['model']"
  # Async save avoids blocking training and reduces peak memory pressure.
  actor_rollout_ref.actor.checkpoint.async_save=True
  # mbridge memory-efficient mode (needs a distributed filesystem, e.g. NFS/HDFS).
  +actor_rollout_ref.actor.checkpoint.mbridge_config.distributed_filesystem=True
  +actor_rollout_ref.actor.checkpoint.mbridge_config.memory_efficient=True
)
# Rollout (generation) configuration for the inference engine.
ROLLOUT=(
  actor_rollout_ref.rollout.name="${rollout_name}"
  actor_rollout_ref.rollout.tensor_model_parallel_size="${GEN_TP}"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5
  actor_rollout_ref.rollout.n=8
  actor_rollout_ref.rollout.mode=async
  actor_rollout_ref.rollout.dtype=bfloat16
  # Qwen3.5 GDN linear attention is incompatible with vLLM CUDA graphs.
  actor_rollout_ref.rollout.enforce_eager=True
  # ===== log_prob recomputation settings =====
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.rollout.max_model_len=4608
)
# Reference-policy configuration (frozen model used for the KL penalty).
REF=(
  # ===== ref log_prob computation settings =====
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.ref.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.ref.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.ref.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.ref.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.ref.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.ref.megatron.param_offload="${ALL_OFFLOAD}"
)
# Advantage estimator / reward-side KL settings.
ALGORITHM=(
  algorithm.adv_estimator="${adv_estimator}"
  algorithm.use_kl_in_reward=False
)
# Trainer topology, logging, and checkpoint cadence.
TRAINER=(
  trainer.critic_warmup=0
  trainer.logger='["console","tensorboard"]'
  trainer.project_name="${project_name}"
  trainer.experiment_name="${exp_name}"
  trainer.n_gpus_per_node=8
  trainer.nnodes=1
  trainer.save_freq=10
  trainer.max_actor_ckpt_to_keep=2
  trainer.val_before_train=False
  trainer.test_freq=-1
  trainer.total_epochs=2
  trainer.default_local_dir="${SAVE_DIR}"
  +trainer.tensorboard_dir="${SAVE_DIR}/logs/tensorboard"
)
# Custom reward-function hook (path to the Python file + function name).
REWARD=(
  custom_reward_function.path="${CUSTOM_REWARD}"
  custom_reward_function.name="my_reward_fn"
)
########################### Launch ###########################
# BUG FIX: the original command had no `\` line continuations, so every
# `--flag` / `"${ARRAY[@]}"` line executed as its own (failing) shell command
# and the trainer was launched with no overrides at all.
: "${SAVE_DIR:?SAVE_DIR must be set}"
mkdir -p -- "${SAVE_DIR}"
python3 -m verl.trainer.main_ppo \
  --config-path=config \
  --config-name='ppo_megatron_trainer.yaml' \
  "${DATA[@]}" \
  "${ALGORITHM[@]}" \
  "${MODEL[@]}" \
  "${ROLLOUT[@]}" \
  "${ACTOR[@]}" \
  "${REF[@]}" \
  "${TRAINER[@]}" \
  "${REWARD[@]}" \
  "$@" 2>&1 | tee "${SAVE_DIR}/training.log"
```
训练曲线:
添加了熵正则也不起作用,想问问有没有成功训练Qwen3.5经验的大佬,有没有遇到过这个问题。