From a0c4b4ccda18bfc96d19bdf32648fef18fdd295b Mon Sep 17 00:00:00 2001
From: blacksheep-Aristotle
Date: Thu, 26 Dec 2024 18:59:15 +0800
Subject: [PATCH] update gpt&baichuan&qwen ce name (#9697)

* update gpt&baichuan&qwen ce name

* update gpt&baichuan&qwen ce name

* update gpt&baichuan&qwen ce name

* update gpt&baichuan&qwen ce name

* update gpt&baichuan&qwen ce name
---
 ..._pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh} | 2 +-
 ...ain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} | 4 ++--
 .../pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json | 4 ++--
 ...ain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh} | 2 +-
 .../gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json | 4 ++--
 .../pretrain_config_llama2_70b/pretrain-llama2_70b.json | 2 +-
 ...rain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh | 2 +-
 .../qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)
 rename tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/{gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh => gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh} (96%)
 rename tests/test_tipc/static/auto_parallel/baichuan2/N4C32/{meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh => baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} (89%)
 rename tests/test_tipc/static/auto_parallel/gpt3/N4C32/{gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh => gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh} (95%)

diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
similarity index 96%
rename from tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh
rename to tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
index a0e988a50cb5..60a09cda3c51 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt-3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP5_Sharding4_Stage1.sh
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-param="model_item=gpt3-13b_pretrain "
+param="model_item=gpt-3-13b_pretrain "
 param+="run_mode=DP1_MP2_PP4_VPP5_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
similarity index 89%
rename from tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
rename to tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
index 7fa6b08e795b..07cd78f92054 100644
--- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-param="model_item=baichuan-inc-baichaun-2-13b_pretrain "
-param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 "
+param="model_item=baichuan-inc-baichuan-2-13b_pretrain "
+param+="run_mode=DP1_MP4_PP2_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
 param+="nnodes=4 "
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json
index 5ef40643865b..c1f36c5a0ff8 100644
--- a/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json
@@ -44,12 +44,12 @@
     "fused_linear": 1,
     "fused_linear_param_grad_add": 1,
     "use_fused_rope": true,
-    "use_fused_rms_norm": false,
+    "use_fused_rms_norm": true,
     "max_seq_length": 4096,
     "sequence_parallel": false,
     "sharding": "stage1",
     "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
\ No newline at end of file
diff --git a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
similarity index 95%
rename from tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh
rename to tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
index 2ab99d620712..103632adc2ef 100644
--- a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh
+++ b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-param="model_item=gpt3-13b_pretrain_dy2st "
+param="model_item=gpt-3-13b_pretrain_dy2st "
 param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
diff --git a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json
index fdcf2b9580be..6f6572c5e0b4 100644
--- a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json
+++ b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json
@@ -39,7 +39,7 @@
     "fuse_attention_qkv": 1,
     "fused_linear_param_grad_add": 1,
     "use_fused_rope": true,
-    "use_fused_rms_norm": false,
+    "use_fused_rms_norm": true,
     "recompute": 0,
     "recompute_use_reentrant": true,
     "recompute_granularity": "full",
@@ -52,7 +52,7 @@
     "attention_probs_dropout_prob": 0.1,
     "hidden_dropout_prob": 0.1,
     "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
\ No newline at end of file
diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json
index 8ec06780d111..3c8faf175b3b 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json
+++ b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_70b/pretrain-llama2_70b.json
@@ -53,7 +53,7 @@
     "pipeline_schedule_mode": "VPP",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "sharding_parallel_config": "enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "max_seq_length": 4096,
     "to_static": true,
     "eliminate_transpose": 1,
diff --git a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
index a52d8a10e34a..50d990884957 100644
--- a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
+++ b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-param="model_item=qwen-2-14b_pretrain_dy2st "
+param="model_item=qwen-14b_pretrain_dy2st "
 param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
diff --git a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json
index 9e8bacb911af..19ae4ffaf0c6 100644
--- a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json
+++ b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json
@@ -50,6 +50,6 @@
     "auto_parallel_resume_form_hybrid_parallel": true,
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "sharding_parallel_config": "enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
\ No newline at end of file