From c174516f751ead8447169ea64a36421e1fbd0912 Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Mon, 25 Nov 2024 20:35:16 +0800
Subject: [PATCH 1/3] Remove the CUDA_DEVICE_MAX_CONNECTIONS environment variable from CE case runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../auto_parallel/llama2/benchmark_common/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index b74b3a9df2e2..b50b9b78015a 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -229,7 +229,7 @@ export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
 
 # If this flag is not set to 1, the sharding stage1 variant without tensor fusion is used by default
 export FLAGS_enable_sharding_stage1_tensor_fusion=1
 
-export CUDA_DEVICE_MAX_CONNECTIONS=1
+# export CUDA_DEVICE_MAX_CONNECTIONS=1
 export PARALLEL_CROSS_ENTROPY=true
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-compliant logs with analysis.py; comment this line out if you only want the training log without that integration, but it must be enabled when submitting

From dfbacb9aeb6e861a725a0f6d99e308c6f113c7be Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 26 Nov 2024 15:09:26 +0800
Subject: [PATCH 2/3] Improve the benchmark execution script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../llama2/benchmark_common/run_benchmark.sh  | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index b50b9b78015a..eb22a20708dd 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -63,8 +63,9 @@ monitor_log_file() {
     local training_pid="$2"        # PID of the training process
     local no_update_duration=0     # counter for how long the log has gone without updates
     local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
 
-    echo "Start monitoring process $training_pid and log file $log_file..."
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
 
     while true; do
         sleep 5  # check the log file every 5 seconds
@@ -74,7 +75,7 @@
             echo "Log file $log_file does not exist, checking process status..."
             # if the log file does not exist, just check whether the process has exited
             if ! ps -p $training_pid > /dev/null; then
-                echo "Process $training_pid has exited."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
                 break
             fi
             continue  # if the file does not exist, skip the rest of the loop body and keep looping
@@ -86,23 +87,26 @@
         if [ "$last_size" -eq "$new_size" ]; then
             # file size unchanged, increase the no-update counter
             no_update_duration=$((no_update_duration + 5))
-
+            echo "$(date '+%Y-%m-%d %H:%M:%S') No new log output..."
             if [ "$no_update_duration" -ge 180 ]; then
-                echo "The log has not been written to in the past 3 minutes, about to kill process $training_pid."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') The log has not been written to in the past 3 minutes, about to kill process $training_pid."
+                # create the flag file
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
                 kill -9 $training_pid  # kill the process
-                echo "Process $training_pid has been killed."
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
                 break
             fi
         else
             # file size changed, reset the no-update counter
-            echo "The log file is still being written..."
+            echo "$(date '+%Y-%m-%d %H:%M:%S') The log file is still being written..."
             no_update_duration=0
             last_size=$new_size
         fi
 
         # exit the monitor if the training process has already finished
         if ! ps -p $training_pid > /dev/null; then
-            echo "Process $training_pid has exited."
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
             break
         fi
     done
@@ -211,6 +215,16 @@ function _train(){
 
     if [ ${exit_code} -ne 0 ];then
         echo -e "${model_name}, FAIL"
+        # If the job exited with an error on its own, i.e. it was not killed by monitor_log_file, wait for the other machines to be killed
+        # location of the flag file
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # clean up the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
     else
         echo -e "${model_name}, SUCCESS"
     fi

From 1d66a45f5da24374a731a2f24cccadcf28026b6d Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 26 Nov 2024 19:26:52 +0800
Subject: [PATCH 3/3] update

---
 .../auto_parallel/llama2/benchmark_common/run_benchmark.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index eb22a20708dd..88b326057402 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -130,8 +130,8 @@ function _train(){
         log_file=${train_log_file}
     fi
 
-    # The 70b model needs this switch disabled, otherwise training hangs
-    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
+    # The 70b and 7b models need this switch disabled
+    if [[ "${MODEL_TYPE}" =~ "70b" || "${MODEL_TYPE}" =~ "7b" ]]; then
         unset CUDA_DEVICE_MAX_CONNECTIONS
     fi
     # Disable for hanging bug
@@ -243,7 +243,8 @@ export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
 
 # If this flag is not set to 1, the sharding stage1 variant without tensor fusion is used by default
 export FLAGS_enable_sharding_stage1_tensor_fusion=1
 
-# export CUDA_DEVICE_MAX_CONNECTIONS=1
+# Only the 13b task needs CUDA_DEVICE_MAX_CONNECTIONS enabled; it is unset for 7b and 70b
+export CUDA_DEVICE_MAX_CONNECTIONS=1
 export PARALLEL_CROSS_ENTROPY=true
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-compliant logs with analysis.py; comment this line out if you only want the training log without that integration, but it must be enabled when submitting
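
The three patches above revolve around one watchdog pattern: monitor_log_file watches the training log, kills the training process after 3 minutes without new output, and leaves a flag file so the caller can tell a watchdog kill apart from a job that errored out on its own. The sketch below is a minimal, self-contained illustration of that pattern only; every name and threshold in it (demo_monitor, demo_train.log, the 30-second limit) is invented for the demo and does not appear in run_benchmark.sh.

    #!/usr/bin/env bash
    # Minimal illustration of the watchdog/flag-file pattern used by the patches above.
    # Names, paths and thresholds are invented for this demo.

    demo_monitor() {
        local log_file="$1"
        local pid="$2"
        local stale=0
        local last_size=0
        local flag="/tmp/demo_monitor_killed_$pid"

        while kill -0 "$pid" 2>/dev/null; do     # loop while the watched process is alive
            sleep 5
            local size
            size=$(stat -c %s "$log_file" 2>/dev/null || echo 0)
            if [ "$size" -eq "$last_size" ]; then
                stale=$((stale + 5))
                if [ "$stale" -ge 30 ]; then     # 30 s without output is enough for the demo
                    touch "$flag"                # marker: the watchdog did the killing
                    kill -9 "$pid"
                    break
                fi
            else
                stale=0
                last_size="$size"
            fi
        done
    }

    log=/tmp/demo_train.log
    : > "$log"
    ( for i in 1 2 3; do echo "step $i" >> "$log"; sleep 2; done; sleep 600 ) &   # stand-in for the training job
    train_pid=$!

    demo_monitor "$log" "$train_pid" &
    wait "$train_pid"

    flag="/tmp/demo_monitor_killed_$train_pid"
    if [ -f "$flag" ]; then
        echo "training was killed by the watchdog"   # run_benchmark.sh removes the flag in this branch
        rm -f "$flag"
    else
        echo "training exited on its own"            # run_benchmark.sh sleeps 120 s here to wait for other machines
    fi

Run as a plain bash script, the fake job stops writing after three steps, the watchdog kills it about 30 seconds later, and the flag file steers the final branch, which mirrors how the FAIL path in _train distinguishes the two exit causes.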