From 31de84eb3ba3cb9540b9eff1b883ec76cfb64376 Mon Sep 17 00:00:00 2001
From: Lvjinhong <96970081+Lvjinhong@users.noreply.github.com>
Date: Thu, 16 Nov 2023 13:46:15 +0800
Subject: [PATCH] Update and improve distributed training functionality (#186)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix multi-node training issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update and improve distributed training functionality

While merging the V2 code recently, I found that the earlier multi-node
changes were not actually correct and still raised errors; this went
unnoticed only because in the single-node multi-GPU case local_rank is
effectively the same as rank.

1. Fix the DDP initialization in train_ms.py and bind .cuda to local_rank
2. Add the LOCAL_RANK env variable to default_config.yml; otherwise it
   raises a KeyError by default
3. Add run_MnodesAndMgpus.sh and update the distributed training notes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 default_config.yml    |  1 +
 run_Mgpus.sh          |  7 -----
 run_MnodesAndMgpus.sh | 31 ++++++++++++++++++++
 train_ms.py           | 68 ++++++++++++++++++++++++++++++-------------
 4 files changed, 80 insertions(+), 27 deletions(-)
 delete mode 100644 run_Mgpus.sh
 create mode 100644 run_MnodesAndMgpus.sh

diff --git a/default_config.yml b/default_config.yml
index bb7871eb..4f264d68 100644
--- a/default_config.yml
+++ b/default_config.yml
@@ -66,6 +66,7 @@ train_ms:
     MASTER_ADDR: "localhost"
     MASTER_PORT: 10086
     WORLD_SIZE: 1
+    LOCAL_RANK: 0
     RANK: 0
     # You can add environment variables with any name you need
     # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
diff --git a/run_Mgpus.sh b/run_Mgpus.sh
deleted file mode 100644
index c43974c0..00000000
--- a/run_Mgpus.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-torchrun \
-  --nnodes=1:3\
-  --nproc_per_node=2\
-  --rdzv_id=1\
-  --rdzv_backend=c10d\
-  --rdzv_endpoint="ib1:8880"\
-  train_ms.py
diff --git a/run_MnodesAndMgpus.sh b/run_MnodesAndMgpus.sh
new file mode 100644
index 00000000..f89efc40
--- /dev/null
+++ b/run_MnodesAndMgpus.sh
@@ -0,0 +1,31 @@
+# Multi-node multi-GPU training
+
+# --nnodes=1:3 means use one to three machines, with elastic resource allocation
+# --nnodes=<min number of nodes>:<max number of nodes>
+# --nproc_per_node=number of GPUs available on each machine
+# --rdzv_endpoint=ip:port of the master node (the one started first)
+# Nothing else needs to change
+
+# Note: distributed training in this version is data-parallel, so multi-node multi-GPU amounts to a larger effective batch size and epochs iterate faster.
+# However, since this version of the code saves checkpoints by global step, the practical effect is that checkpoints are not saved noticeably faster,
+# but each saved checkpoint has gone through more epochs than before, i.e. "better results in fewer steps".
+
+#*************************
+# torchrun \
+#   --nnodes=1:3\
+#   --nproc_per_node=2\
+#   --rdzv_id=1\
+#   --rdzv_backend=c10d\
+#   --rdzv_endpoint="inspur1:8880"\
+#   train_ms.py
+#****************************
+
+# Single-node multi-GPU training
+# nproc_per_node = number of GPUs available on the machine
+
+#*************************
+torchrun \
+  --nnodes=1\
+  --nproc_per_node=2\
+  train_ms.py
+#*************************
diff --git a/train_ms.py b/train_ms.py
index 1bda2a58..30d98428 100644
--- a/train_ms.py
+++ b/train_ms.py
@@ -51,7 +51,17 @@ def run():
     envs = config.train_ms_config.env
     for env_name, env_value in envs.items():
         if env_name not in os.environ.keys():
+            print("Loading setting from config: {}".format(str(env_value)))
             os.environ[env_name] = str(env_value)
+    print(
+        "Loaded environment variables:\nMASTER_ADDR: {},\nMASTER_PORT: {},\nWORLD_SIZE: {},\nRANK: {},\nLOCAL_RANK: {}".format(
+            os.environ["MASTER_ADDR"],
+            os.environ["MASTER_PORT"],
+            os.environ["WORLD_SIZE"],
+            os.environ["RANK"],
+            os.environ["LOCAL_RANK"],
+        )
+    )
 
     # Multi-GPU training setup
     backend = "nccl"
@@ -162,7 +172,7 @@ def run():
         3,
         0.1,
         gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0,
-    ).cuda(rank)
+    ).cuda(local_rank)
     if (
         "use_spk_conditioned_encoder" in hps.model.keys()
         and hps.model.use_spk_conditioned_encoder is True
@@ -182,9 +192,9 @@ def run():
         mas_noise_scale_initial=mas_noise_scale_initial,
         noise_scale_delta=noise_scale_delta,
         **hps.model,
-    ).cuda(rank)
+    ).cuda(local_rank)
 
-    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
+    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank)
     optim_g = torch.optim.AdamW(
         filter(lambda p: p.requires_grad, net_g.parameters()),
         hps.train.learning_rate,
@@ -206,11 +216,13 @@ def run():
         )
     else:
         optim_dur_disc = None
-    net_g = DDP(net_g, device_ids=[rank])
-    net_d = DDP(net_d, device_ids=[rank])
+    net_g = DDP(net_g, device_ids=[local_rank])
+    net_d = DDP(net_d, device_ids=[local_rank])
     dur_resume_lr = None
     if net_dur_disc is not None:
-        net_dur_disc = DDP(net_dur_disc, device_ids=[rank], find_unused_parameters=True)
+        net_dur_disc = DDP(
+            net_dur_disc, device_ids=[local_rank], find_unused_parameters=True
+        )
 
     # Download the pretrained base models
     if config.train_ms_config.base["use_base_model"]:
@@ -256,6 +268,9 @@ def run():
             epoch_str = max(epoch_str, 1)
         global_step = (epoch_str - 1) * len(train_loader)
+        print(
+            f"******************Found an existing checkpoint, epoch: {epoch_str}, global step: {global_step}*********************"
+        )
     except Exception as e:
         print(e)
         epoch_str = 1
@@ -281,6 +296,7 @@ def run():
         if rank == 0:
             train_and_evaluate(
                 rank,
+                local_rank,
                 epoch,
                 hps,
                 [net_g, net_d, net_dur_disc],
@@ -294,6 +310,7 @@ def run():
         else:
             train_and_evaluate(
                 rank,
+                local_rank,
                 epoch,
                 hps,
                 [net_g, net_d, net_dur_disc],
@@ -311,7 +328,17 @@
 
 
 def train_and_evaluate(
-    rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers
+    rank,
+    local_rank,
+    epoch,
+    hps,
+    nets,
+    optims,
+    schedulers,
+    scaler,
+    loaders,
+    logger,
+    writers,
 ):
     net_g, net_d, net_dur_disc = nets
     optim_g, optim_d, optim_dur_disc = optims
@@ -327,6 +354,7 @@ def train_and_evaluate(
     net_d.train()
     if net_dur_disc is not None:
         net_dur_disc.train()
+
     for batch_idx, (
         x,
         x_lengths,
@@ -347,21 +375,21 @@ def train_and_evaluate(
                 - net_g.module.noise_scale_delta * global_step
             )
             net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0)
-        x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(
-            rank, non_blocking=True
-        )
-        spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(
-            rank, non_blocking=True
+        x, x_lengths = x.cuda(local_rank, non_blocking=True), x_lengths.cuda(
+            local_rank, non_blocking=True
         )
-        y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(
-            rank, non_blocking=True
+        spec, spec_lengths = spec.cuda(
+            local_rank, non_blocking=True
+        ), spec_lengths.cuda(local_rank, non_blocking=True)
+        y, y_lengths = y.cuda(local_rank, non_blocking=True), y_lengths.cuda(
+            local_rank, non_blocking=True
         )
-        speakers = speakers.cuda(rank, non_blocking=True)
-        tone = tone.cuda(rank, non_blocking=True)
-        language = language.cuda(rank, non_blocking=True)
-        bert = bert.cuda(rank, non_blocking=True)
-        ja_bert = ja_bert.cuda(rank, non_blocking=True)
-        en_bert = en_bert.cuda(rank, non_blocking=True)
+        speakers = speakers.cuda(local_rank, non_blocking=True)
+        tone = tone.cuda(local_rank, non_blocking=True)
+        language = language.cuda(local_rank, non_blocking=True)
+        bert = bert.cuda(local_rank, non_blocking=True)
+        ja_bert = ja_bert.cuda(local_rank, non_blocking=True)
+        en_bert = en_bert.cuda(local_rank, non_blocking=True)
         with autocast(enabled=hps.train.fp16_run):
             (
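Reviewer note: the crux of this patch is the distinction between the global RANK and the per-node LOCAL_RANK that torchrun exports to each worker process. The following minimal, self-contained sketch is not part of the patch (the Linear model and tensor sizes are placeholders); it only illustrates why DDP and .cuda() must bind to LOCAL_RANK while rank-0-only work such as logging and checkpointing keys off the global RANK.

import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    # torchrun exports these for every worker it launches.
    rank = int(os.environ["RANK"])  # global index, unique across all nodes
    local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node only

    # MASTER_ADDR / MASTER_PORT / WORLD_SIZE are also read from the environment.
    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(local_rank)

    # Placeholder model; bind the module and DDP to the *local* GPU index.
    # Using the global rank here would point at a non-existent GPU on every
    # node after the first one.
    model = torch.nn.Linear(16, 16).cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])

    # Tiny forward pass just to show the worker is usable.
    out = model(torch.randn(4, 16, device=f"cuda:{local_rank}"))

    if rank == 0:
        # Only the globally first worker should log and save checkpoints.
        print("world size:", dist.get_world_size(), "output shape:", tuple(out.shape))

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

The sketch is launched the same way as train_ms.py in run_MnodesAndMgpus.sh: torchrun --nnodes=1 --nproc_per_node=2 for a single node, or with the rdzv options shown in the commented-out block for multiple nodes.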