From 79b2287d5bfb6efb64a900e587dd41c9186ae368 Mon Sep 17 00:00:00 2001
From: lzzyzlbb <287246233@qq.com>
Date: Fri, 5 Nov 2021 13:59:10 +0800
Subject: [PATCH] Add profile (#463)

* fix benchmark

* Add profile
---
 benchmark/README.md        |  7 +++++++
 benchmark/run_all.sh       |  7 ++++---
 benchmark/run_benchmark.sh | 11 ++++++++++-
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index 376b403c54763..ae66deefeec50 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -42,6 +42,13 @@ nvidia-docker run --name test_paddlegan -i \
     ${ImageName} /bin/bash -c "${run_cmd}"
 ```
 
+To enable the profile option, simply replace `run_cmd` with:
+```
+run_cmd="set -xe;
+        cd /workspace ;
+        bash -x benchmark/run_all.sh on"
+```
+
 ## Output
 
 After the run finishes, files with model training performance data, such as `esrgan_mp_bs32_fp32_8`, are produced in the PaddleGAN directory.
diff --git a/benchmark/run_all.sh b/benchmark/run_all.sh
index 92918c643df00..fab5e86af70fa 100755
--- a/benchmark/run_all.sh
+++ b/benchmark/run_all.sh
@@ -53,6 +53,7 @@ function parse_yaml {
 }
 
 eval $(parse_yaml "benchmark/benchmark.yaml")
+profile=${1:-"off"}
 
 for model_mode in ${model_mode_list[@]}; do
     eval fp_item_list='$'"${model_mode}_fp_item"
@@ -82,15 +83,15 @@ for model_mode in ${model_mode_list[@]}; do
         do
             echo "index is speed, 1gpus, begin, ${model_name}"
             run_mode=sp
-            CUDA_VISIBLE_DEVICES=0 benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} # (5min)
+            CUDA_VISIBLE_DEVICES=0 benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} ${profile} # (5min)
             sleep 60
             echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
             run_mode=mp
             basicvsr_name=basicvsr
             if [ ${model_mode} = ${basicvsr_name} ]; then
-                CUDA_VISIBLE_DEVICES=0,1,2,3 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval}
+                CUDA_VISIBLE_DEVICES=0,1,2,3 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} ${profile}
             else
-                CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval}
+                CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} ${profile}
             fi
             sleep 60
         done
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index 2c0bcb95859ee..25d9980512378 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -12,6 +12,7 @@ function _set_params(){
     config=${7:-"config"}
     log_interval=${8:-"1"}
     run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set later by QA
+    need_profile=${9:-"off"}
 
 # No need to modify the lines below
     device=${CUDA_VISIBLE_DEVICES//,/ }
@@ -19,6 +20,7 @@ function _set_params(){
     arr=(${device})
     num_gpu_devices=${#arr[*]}
     log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
     res_log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
+    log_profile=${run_log_path}/${model_name}_model.profile
 }
 
 function _analysis_log(){
@@ -29,7 +31,14 @@ function _train(){
     echo "Train on ${num_gpu_devices} GPUs"
     echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
 
-    train_cmd="--config-file=${config}
+    profiler_cmd=""
+    profiler_options="batch_range=[10,20];profile_path=${log_profile}"
+    if [ $need_profile = "on" ]; then
+        profiler_cmd="--profiler_options=${profiler_options}"
+    fi
+
+    train_cmd="${profiler_cmd}
+               --config-file=${config}
     -o dataset.train.batch_size=${batch_size}
     log_config.interval=${log_interval}
     ${mode}=${max_iter} "
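
For context, here is a minimal usage sketch of how the new profile switch threads through the scripts. The batch size, iteration count, model name, `total_iters` key, and config path below are illustrative placeholders, not values taken from this patch:

```
# Turn profiling on for the whole benchmark suite: the first argument becomes
# ${profile} in run_all.sh and is forwarded as the 9th argument of run_benchmark.sh.
cd /workspace
bash -x benchmark/run_all.sh on

# Equivalent single-model call; argument order follows the run_all.sh invocation:
#   run_mode batch_size fp_item mode max_iter model config log_interval profile
# (the concrete values here are hypothetical examples)
CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark.sh sp 32 fp32 total_iters 300 \
    esrgan configs/esrgan_x4_div2k.yaml 1 on

# With the 9th argument set to "on", _train() prepends
#   --profiler_options="batch_range=[10,20];profile_path=<run_log_path>/<model_name>_model.profile"
# to the training command; otherwise profiler_cmd stays empty.
```

Because both `profile` in run_all.sh and `need_profile` in run_benchmark.sh default to "off", existing benchmark invocations keep their previous behavior.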