diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
similarity index 96%
rename from tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh
rename to tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
index a0e988a50cb5..60a09cda3c51 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-param="model_item=gpt3-13b_pretrain "
+param="model_item=gpt-3-13b_pretrain "
 param+="run_mode=DP1_MP2_PP4_VPP5_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
similarity index 89%
rename from tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
rename to tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
index 7fa6b08e795b..07cd78f92054 100644
--- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-param="model_item=baichuan-inc-baichaun-2-13b_pretrain " -param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 " +param="model_item=baichuan-inc-baichuan-2-13b_pretrain " +param+="run_mode=DP1_MP4_PP2_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " param+="global_batch_size=128 " param+="nnodes=4 " diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json index 5ef40643865b..c1f36c5a0ff8 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json +++ b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json @@ -44,12 +44,12 @@ "fused_linear": 1, "fused_linear_param_grad_add": 1, "use_fused_rope": true, - "use_fused_rms_norm": false, + "use_fused_rms_norm": true, "max_seq_length": 4096, "sequence_parallel": false, "sharding": "stage1", "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap", - "tensor_parallel_config": "enable_mp_async_allreduce", + "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy", "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward" } \ No newline at end of file diff --git a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh similarity index 95% rename from tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh index 2ab99d620712..103632adc2ef 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-param="model_item=gpt3-13b_pretrain_dy2st " +param="model_item=gpt-3-13b_pretrain_dy2st " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " param+="global_batch_size=128 " diff --git a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json index fdcf2b9580be..6f6572c5e0b4 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json +++ b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json @@ -39,7 +39,7 @@ "fuse_attention_qkv": 1, "fused_linear_param_grad_add": 1, "use_fused_rope": true, - "use_fused_rms_norm": false, + "use_fused_rms_norm": true, "recompute": 0, "recompute_use_reentrant": true, "recompute_granularity": "full", @@ -52,7 +52,7 @@ "attention_probs_dropout_prob": 0.1, "hidden_dropout_prob": 0.1, "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap", - "tensor_parallel_config": "enable_mp_async_allreduce", + "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy", "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward" } \ No newline at end of file diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json index 8ec06780d111..3c8faf175b3b 100644 --- a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json +++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json @@ -53,7 +53,7 @@ "pipeline_schedule_mode": "VPP", "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", "sharding_parallel_config": "enable_stage1_overlap", - "tensor_parallel_config": "enable_mp_async_allreduce", + "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy", "max_seq_length": 4096, "to_static": true, "eliminate_transpose": 1, diff --git a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh index a52d8a10e34a..50d990884957 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-param="model_item=qwen-2-14b_pretrain_dy2st " +param="model_item=qwen-14b_pretrain_dy2st " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " param+="global_batch_size=128 " diff --git a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json index 9e8bacb911af..19ae4ffaf0c6 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json +++ b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json @@ -50,6 +50,6 @@ "auto_parallel_resume_form_hybrid_parallel": true, "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", "sharding_parallel_config": "enable_stage1_overlap", - "tensor_parallel_config": "enable_mp_async_allreduce", + "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy", "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward" } \ No newline at end of file