File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -156,8 +156,8 @@ false \
156156sel \
1571571.0 \
158158100000 \
159- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
160- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
159+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
160+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
161161/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
1621621000000000 \
16316310000 \
@@ -192,8 +192,8 @@ true \
192192sel \
1931931.0 \
194194100000 \
195- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
196- /mnt/deepseek-datasets/mmap_deepseekv3_datasets_text_document \
195+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
196+ /mnt/deepseek-datasets/mmap_deepseekv2_datasets_text_document \
197197/mnt/deepseek-ckpts/DeepSeek-V3-to-mcore-tp8-pp8-ep16 \
19819810000 \
199199100 \
Original file line number Diff line number Diff line change @@ -76,7 +76,7 @@ NUM_ATTENTION_HEADS=128
7676NUM_LAYERS=61
7777INTERMEDIATE_SIZE=18432
7878MOE_INTERMEDIATE_SIZE=2048
79- MAX_POSITION_EMBEDDINGS=163840
79+ MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
8080EXTRA_VOCAB_SIZE=467
8181Q_LORA_RANK=1536
8282KV_LORA_RANK=512
@@ -106,6 +106,8 @@ moe_options=" \
106106 --moe-router-enable-expert-bias \
107107 --mscale 1.0 \
108108 --mscale-all-dim 1.0 \
109+ --moe-token-drop-policy probs \
110+ --moe-router-pre-softmax \
109111 --moe-router-score-function sigmoid \
110112 --moe-router-bias-update-rate 0.001 \
111113 --moe-aux-loss-coeff 0.001 \
Original file line number Diff line number Diff line change @@ -541,9 +541,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples):
541541 EnergonDataloader (get_loader (valid_ds , worker_config = worker_config ))
542542 for valid_ds in valid_ds1
543543 ]
544- test_dataloader = None # NOTE: no test
544+ test_dataloader = None
545545
546- return EnergonDataloader (train_dataloader ), valid_dataloader , None
546+ return EnergonDataloader (train_dataloader ), valid_dataloader , EnergonDataloader ( test_dataloader )
547547
548548
549549class EnergonDataloader :
You can’t perform that action at this time.
0 commit comments