From c174516f751ead8447169ea64a36421e1fbd0912 Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Mon, 25 Nov 2024 20:35:16 +0800
Subject: [PATCH 1/3] Remove the CUDA_DEVICE_MAX_CONNECTIONS environment variable from CE case runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../auto_parallel/llama2/benchmark_common/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index b74b3a9df2e2..b50b9b78015a 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -229,7 +229,7 @@ export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
 
 # If this flag is not set to 1, the sharding stage1 variant without tensor fusion is used by default
 export FLAGS_enable_sharding_stage1_tensor_fusion=1
 
-export CUDA_DEVICE_MAX_CONNECTIONS=1
+# export CUDA_DEVICE_MAX_CONNECTIONS=1
 export PARALLEL_CROSS_ENTROPY=true
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-compliant logs with analysis.py; comment this line out if you only want the training log without that integration, but it must be enabled when submitting

From dfbacb9aeb6e861a725a0f6d99e308c6f113c7be Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 26 Nov 2024 15:09:26 +0800
Subject: [PATCH 2/3] Improve the benchmark execution script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../llama2/benchmark_common/run_benchmark.sh  | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index b50b9b78015a..eb22a20708dd 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -63,8 +63,9 @@ monitor_log_file() {
     local training_pid="$2"        # PID of the training process
     local no_update_duration=0     # counter for how long the log has gone without updates
     local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
 
-    echo "Start monitoring process $training_pid and log file $log_file..."
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
 
     while true; do
         sleep 5  # check the log file every 5 seconds
@@ -74,7 +75,7 @@
             echo "Log file $log_file does not exist, checking process status..."
             # if the log file does not exist, just check whether the process has exited
             if ! ps -p $training_pid > /dev/null; then
-                echo "Process $training_pid has exited."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
                 break
             fi
             continue  # if the file does not exist, skip the rest of the loop body and keep looping
@@ -86,23 +87,26 @@
         if [ "$last_size" -eq "$new_size" ]; then
             # file size unchanged, increase the no-update counter
             no_update_duration=$((no_update_duration + 5))
-
+            echo "$(date '+%Y-%m-%d %H:%M:%S') No new log output..."
             if [ "$no_update_duration" -ge 180 ]; then
-                echo "The log has not been written to in the past 3 minutes, about to kill process $training_pid."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') The log has not been written to in the past 3 minutes, about to kill process $training_pid."
+                # create the flag file
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
                 kill -9 $training_pid  # kill the process
-                echo "Process $training_pid has been killed."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
                 break
             fi
         else
             # file size changed, reset the no-update counter
-            echo "The log file is still being written..."
+            echo "$(date '+%Y-%m-%d %H:%M:%S') The log file is still being written..."
             no_update_duration=0
             last_size=$new_size
         fi
 
         # exit the monitor if the training process has already finished
         if ! ps -p $training_pid > /dev/null; then
-            echo "Process $training_pid has exited."
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
             break
         fi
     done
@@ -211,6 +215,16 @@ function _train(){
 
     if [ ${exit_code} -ne 0 ];then
         echo -e "${model_name}, FAIL"
+        # If the job exited with an error on its own, i.e. it was not killed by monitor_log_file, wait for the other machines to be killed
+        # location of the flag file
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # clean up the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
     else
         echo -e "${model_name}, SUCCESS"
     fi

From 1d66a45f5da24374a731a2f24cccadcf28026b6d Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 26 Nov 2024 19:26:52 +0800
Subject: [PATCH 3/3] update

---
 .../auto_parallel/llama2/benchmark_common/run_benchmark.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index eb22a20708dd..88b326057402 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -130,8 +130,8 @@ function _train(){
         log_file=${train_log_file}
     fi
 
-    # The 70b model needs this switch disabled, otherwise training hangs
-    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
+    # The 70b and 7b models need this switch disabled
+    if [[ "${MODEL_TYPE}" =~ "70b" || "${MODEL_TYPE}" =~ "7b" ]]; then
         unset CUDA_DEVICE_MAX_CONNECTIONS
     fi
     # Disable for hanging bug
@@ -243,7 +243,8 @@ export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
 
 # If this flag is not set to 1, the sharding stage1 variant without tensor fusion is used by default
 export FLAGS_enable_sharding_stage1_tensor_fusion=1
 
-# export CUDA_DEVICE_MAX_CONNECTIONS=1
+# Only the 13b task needs CUDA_DEVICE_MAX_CONNECTIONS enabled; it is unset for 7b and 70b
+export CUDA_DEVICE_MAX_CONNECTIONS=1
 export PARALLEL_CROSS_ENTROPY=true
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-compliant logs with analysis.py; comment this line out if you only want the training log without that integration, but it must be enabled when submitting
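
The three patches above revolve around one watchdog pattern: monitor_log_file watches the training log, kills the training process after 3 minutes without new output, and leaves a flag file so the caller can tell a watchdog kill apart from a job that errored out on its own. The sketch below is a minimal, self-contained illustration of that pattern only; every name and threshold in it (demo_monitor, demo_train.log, the 30-second limit) is invented for the demo and does not appear in run_benchmark.sh.

    #!/usr/bin/env bash
    # Minimal illustration of the watchdog/flag-file pattern used by the patches above.
    # Names, paths and thresholds are invented for this demo.

    demo_monitor() {
        local log_file="$1"
        local pid="$2"
        local stale=0
        local last_size=0
        local flag="/tmp/demo_monitor_killed_$pid"

        while kill -0 "$pid" 2>/dev/null; do     # loop while the watched process is alive
            sleep 5
            local size
            size=$(stat -c %s "$log_file" 2>/dev/null || echo 0)
            if [ "$size" -eq "$last_size" ]; then
                stale=$((stale + 5))
                if [ "$stale" -ge 30 ]; then     # 30 s without output is enough for the demo
                    touch "$flag"                # marker: the watchdog did the killing
                    kill -9 "$pid"
                    break
                fi
            else
                stale=0
                last_size="$size"
            fi
        done
    }

    log=/tmp/demo_train.log
    : > "$log"
    ( for i in 1 2 3; do echo "step $i" >> "$log"; sleep 2; done; sleep 600 ) &   # stand-in for the training job
    train_pid=$!

    demo_monitor "$log" "$train_pid" &
    wait "$train_pid"

    flag="/tmp/demo_monitor_killed_$train_pid"
    if [ -f "$flag" ]; then
        echo "training was killed by the watchdog"   # run_benchmark.sh removes the flag in this branch
        rm -f "$flag"
    else
        echo "training exited on its own"            # run_benchmark.sh sleeps 120 s here to wait for other machines
    fi

Run as a plain bash script, the fake job stops writing after three steps, the watchdog kills it about 30 seconds later, and the flag file steers the final branch, which mirrors how the FAIL path in _train distinguishes the two exit causes.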