Skip to content

训练Qwen3.5-35B-A3B出现熵坍塌的问题 #5953

@panjicaiz

Description

@panjicaiz

训练脚本配置如下:
```bash
# Hydra data overrides: dataset files, batch size, and sequence-length limits.
DATA=(
  # Quoted so paths containing spaces stay a single array element.
  data.train_files="${train_path}"
  data.val_files="${test_path}"
  data.train_batch_size=32
  data.max_prompt_length=4096
  data.max_response_length=512
  data.truncation='error'
  data.filter_overlong_prompts=True
  # Disable "thinking" mode in the chat template for rollout generation.
  +data.apply_chat_template_kwargs.enable_thinking=False
)

# Model source and loading flags.
MODEL=(
  # Quoted: an HF path with spaces would otherwise split into several overrides.
  actor_rollout_ref.model.path="${HF_MODEL_PATH}"
  actor_rollout_ref.model.trust_remote_code=True
  actor_rollout_ref.model.use_remove_padding=False
)

# Actor (trainer) configuration: optimizer, PPO clipping / KL regularization,
# Megatron parallelism layout, CPU offloading, and checkpointing.
# Variable expansions are quoted so empty/space-containing values cannot
# split into extra array elements.
ACTOR=(
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.freeze_vision_tower=True
  actor_rollout_ref.actor.ppo_mini_batch_size=16
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
  actor_rollout_ref.actor.use_dynamic_bsz=False
  # KL penalty against the reference policy.
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.01
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  # Asymmetric PPO clipping (higher upper bound, "clip-higher" style).
  actor_rollout_ref.actor.clip_ratio_high=0.28
  actor_rollout_ref.actor.clip_ratio_low=0.2
  actor_rollout_ref.actor.entropy_coeff=0.01
  # Megatron backend and parallelism layout.
  actor_rollout_ref.actor.megatron.use_mbridge=True
  actor_rollout_ref.actor.megatron.vanilla_mbridge=True
  actor_rollout_ref.actor.megatron.use_remove_padding=False
  actor_rollout_ref.actor.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.actor.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.actor.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.actor.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.actor.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.actor.megatron.param_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.optimizer_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.grad_offload="${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.dtype=bfloat16
  ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
  # Full activation recomputation to reduce memory.
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
  # MoE load-balancing and router z-loss coefficients.
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
  # CPU optimizer offload settings.
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
  +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
  # ===== Checkpoint memory optimizations =====
  # Save only model weights (no optimizer state) to greatly reduce memory use:
  # actor_rollout_ref.actor.checkpoint.save_contents="['model']"
  # Async save avoids blocking training and peak-memory pressure.
  actor_rollout_ref.actor.checkpoint.async_save=True
  # mbridge memory-efficient mode (requires a distributed FS such as NFS/HDFS).
  +actor_rollout_ref.actor.checkpoint.mbridge_config.distributed_filesystem=True
  +actor_rollout_ref.actor.checkpoint.mbridge_config.memory_efficient=True
)

# Rollout (generation) engine configuration.
ROLLOUT=(
  # Quoted so empty values cannot drop out of or split the element.
  actor_rollout_ref.rollout.name="${rollout_name}"
  actor_rollout_ref.rollout.tensor_model_parallel_size="${GEN_TP}"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5
  actor_rollout_ref.rollout.n=8
  actor_rollout_ref.rollout.mode=async
  actor_rollout_ref.rollout.dtype=bfloat16
  # Qwen3.5 GDN linear attention is incompatible with vLLM CUDA graphs
  actor_rollout_ref.rollout.enforce_eager=True
  # ===== log_prob computation settings =====
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
  actor_rollout_ref.rollout.max_model_len=4608
)

# Reference-policy configuration (mirrors the actor's Megatron layout).
REF=(
  # ===== ref log_prob computation settings =====
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
  # Quoted: must stay in lockstep with the ACTOR parallelism settings.
  actor_rollout_ref.ref.megatron.tensor_model_parallel_size="${TP}"
  actor_rollout_ref.ref.megatron.pipeline_model_parallel_size="${PP}"
  actor_rollout_ref.ref.megatron.context_parallel_size="${CP}"
  actor_rollout_ref.ref.megatron.expert_model_parallel_size="${EP}"
  actor_rollout_ref.ref.megatron.expert_tensor_parallel_size="${ETP}"
  actor_rollout_ref.ref.megatron.param_offload="${ALL_OFFLOAD}"
)

# Advantage estimation / reward shaping.
ALGORITHM=(
  algorithm.adv_estimator="${adv_estimator}"
  algorithm.use_kl_in_reward=False
)

# Trainer topology, logging, and checkpoint cadence.
TRAINER=(
  trainer.critic_warmup=0
  trainer.logger='["console","tensorboard"]'
  # Quoted: experiment names often contain spaces or shell metacharacters.
  trainer.project_name="${project_name}"
  trainer.experiment_name="${exp_name}"
  trainer.n_gpus_per_node=8
  trainer.nnodes=1
  trainer.save_freq=10
  trainer.max_actor_ckpt_to_keep=2
  trainer.val_before_train=False
  trainer.test_freq=-1
  trainer.total_epochs=2
  trainer.default_local_dir="${SAVE_DIR}"
  +trainer.tensorboard_dir="${SAVE_DIR}/logs/tensorboard"
)

# Custom reward function plugin (path to module + function name inside it).
REWARD=(
  custom_reward_function.path="${CUSTOM_REWARD}"
  custom_reward_function.name="my_reward_fn"
)

########################### Launch ###########################

# NOTE(review): without `set -o pipefail` the pipeline's exit status is
# tee's (almost always 0), so a failed training run is not surfaced to the
# caller — confirm whether pipefail should be enabled near the top of the
# script.
# Line continuations (`\`) are required: without them every option line
# below would execute as its own (failing) command.
python3 -m verl.trainer.main_ppo \
  --config-path=config \
  --config-name='ppo_megatron_trainer.yaml' \
  "${DATA[@]}" \
  "${ALGORITHM[@]}" \
  "${MODEL[@]}" \
  "${ROLLOUT[@]}" \
  "${ACTOR[@]}" \
  "${REF[@]}" \
  "${TRAINER[@]}" \
  "${REWARD[@]}" \
  "$@" 2>&1 | tee "${SAVE_DIR}/training.log"
```

训练曲线:

Image

添加了熵正则也不起作用,想问问有没有成功训练Qwen3.5经验的大佬,有没有遇到过这个问题。

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions