Update and improve distributed training functionality (yl4579#186)
* Fix multi-node training issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update and improve distributed training functionality

While recently merging the V2 code, it turned out that the earlier multi-node changes were not actually correct and still raise errors; this went unnoticed because in the single-node multi-GPU case local_rank happens to equal rank.
1. Fix the DDP initialization in train_ms.py and bind .cuda to local_rank (see the sketch below)
2. Add the LOCAL_RANK env variable to default_config.yml; without it the default setup raises a KeyError
3. Add run_MnodesAndMgpus.sh and update the distributed training notes
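
A minimal sketch of the local_rank binding this change moves to (the helper name setup_ddp is illustrative and not part of train_ms.py; it assumes torchrun has set RANK, LOCAL_RANK and WORLD_SIZE in the environment):

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_ddp(model: torch.nn.Module) -> DDP:
    # torchrun exports these for every worker process
    rank = int(os.environ["RANK"])              # global rank across all nodes
    local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node
    world_size = int(os.environ["WORLD_SIZE"])
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)  # bind to the local GPU, not the global rank
    model = model.cuda(local_rank)
    return DDP(model, device_ids=[local_rank])

On a single node rank equals local_rank, which is why binding .cuda(rank) only breaks once a second node joins.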

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Lvjinhong and pre-commit-ci[bot] authored Nov 16, 2023
1 parent 1fbddf4 commit 31de84e
Showing 4 changed files with 80 additions and 27 deletions.
1 change: 1 addition & 0 deletions default_config.yml
@@ -66,6 +66,7 @@ train_ms:
MASTER_ADDR: "localhost"
MASTER_PORT: 10086
WORLD_SIZE: 1
LOCAL_RANK: 0
RANK: 0
# Any additional environment variables can be added here by name
# THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
7 changes: 0 additions & 7 deletions run_Mgpus.sh

This file was deleted.

31 changes: 31 additions & 0 deletions run_MnodesAndMgpus.sh
@@ -0,0 +1,31 @@
# Multi-node, multi-GPU training

# --nnodes=1:3 means: use one to three machines, with resources allocated elastically
# --nnodes=<min nodes>:<max nodes>
# --nproc_per_node=<number of GPUs available on each machine>
# --rdzv_endpoint=<ip:port of the master node (the one started first)>
# Nothing else needs to change

# Note: distributed training in this version is data-parallel, so multi-node multi-GPU effectively
# means a larger batch size and epochs iterate faster. However, since this version of the code
# saves checkpoints by global step, saving does not get noticeably faster; instead, each saved
# checkpoint has been through more epochs than before, i.e. "better results in fewer steps".
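# A rough illustration with assumed numbers (not taken from this repo's config):
# with batch_size=16 per GPU, --nproc_per_node=2 and 3 nodes, each optimizer step
# sees an effective batch of 16 * 2 * 3 = 96 samples, so an epoch needs fewer steps.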

#*************************
# torchrun \
# --nnodes=1:3\
# --nproc_per_node=2\
# --rdzv_id=1\
# --rdzv_backend=c10d\
# --rdzv_endpoint="inspur1:8880"\
# train_ms.py
#****************************

# Multi-GPU training (single node)
# nproc_per_node = number of GPUs available on the machine

#*************************
torchrun \
--nnodes=1\
--nproc_per_node=2\
train_ms.py
#*************************
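
# Usage note (generic torchrun behaviour, not specific to this repo): for the multi-node
# variant above, start the same torchrun command on every participating node; the
# --rdzv_endpoint host:port (here "inspur1:8880") just has to be reachable from all nodes.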
68 changes: 48 additions & 20 deletions train_ms.py
@@ -51,7 +51,17 @@ def run():
envs = config.train_ms_config.env
for env_name, env_value in envs.items():
if env_name not in os.environ.keys():
print("Loading setting from config: {}".format(str(env_value)))
os.environ[env_name] = str(env_value)
print(
"Loading environment variables \nMASTER_ADDR: {},\nMASTER_PORT: {},\nWORLD_SIZE: {},\nRANK: {},\nLOCAL_RANK: {}".format(
os.environ["MASTER_ADDR"],
os.environ["MASTER_PORT"],
os.environ["WORLD_SIZE"],
os.environ["RANK"],
os.environ["LOCAL_RANK"],
)
)

# Multi-GPU training setup
backend = "nccl"
@@ -162,7 +172,7 @@ def run():
3,
0.1,
gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0,
).cuda(rank)
).cuda(local_rank)
if (
"use_spk_conditioned_encoder" in hps.model.keys()
and hps.model.use_spk_conditioned_encoder is True
@@ -182,9 +192,9 @@ def run():
mas_noise_scale_initial=mas_noise_scale_initial,
noise_scale_delta=noise_scale_delta,
**hps.model,
).cuda(rank)
).cuda(local_rank)

net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank)
optim_g = torch.optim.AdamW(
filter(lambda p: p.requires_grad, net_g.parameters()),
hps.train.learning_rate,
@@ -206,11 +216,13 @@ def run():
)
else:
optim_dur_disc = None
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
net_g = DDP(net_g, device_ids=[local_rank])
net_d = DDP(net_d, device_ids=[local_rank])
dur_resume_lr = None
if net_dur_disc is not None:
net_dur_disc = DDP(net_dur_disc, device_ids=[rank], find_unused_parameters=True)
net_dur_disc = DDP(
net_dur_disc, device_ids=[local_rank], find_unused_parameters=True
)

# Download the base model
if config.train_ms_config.base["use_base_model"]:
@@ -256,6 +268,9 @@ def run():

epoch_str = max(epoch_str, 1)
global_step = (epoch_str - 1) * len(train_loader)
print(
f"******************Existing model detected, epoch: {epoch_str}, global step: {global_step}*********************"
)
except Exception as e:
print(e)
epoch_str = 1
@@ -281,6 +296,7 @@ def run():
if rank == 0:
train_and_evaluate(
rank,
local_rank,
epoch,
hps,
[net_g, net_d, net_dur_disc],
@@ -294,6 +310,7 @@ def run():
else:
train_and_evaluate(
rank,
local_rank,
epoch,
hps,
[net_g, net_d, net_dur_disc],
@@ -311,7 +328,17 @@


def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers
rank,
local_rank,
epoch,
hps,
nets,
optims,
schedulers,
scaler,
loaders,
logger,
writers,
):
net_g, net_d, net_dur_disc = nets
optim_g, optim_d, optim_dur_disc = optims
@@ -327,6 +354,7 @@ def train_and_evaluate(
net_d.train()
if net_dur_disc is not None:
net_dur_disc.train()

for batch_idx, (
x,
x_lengths,
@@ -347,21 +375,21 @@ def train_and_evaluate(
- net_g.module.noise_scale_delta * global_step
)
net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0)
x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(
rank, non_blocking=True
)
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(
rank, non_blocking=True
x, x_lengths = x.cuda(local_rank, non_blocking=True), x_lengths.cuda(
local_rank, non_blocking=True
)
y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(
rank, non_blocking=True
spec, spec_lengths = spec.cuda(
local_rank, non_blocking=True
), spec_lengths.cuda(local_rank, non_blocking=True)
y, y_lengths = y.cuda(local_rank, non_blocking=True), y_lengths.cuda(
local_rank, non_blocking=True
)
speakers = speakers.cuda(rank, non_blocking=True)
tone = tone.cuda(rank, non_blocking=True)
language = language.cuda(rank, non_blocking=True)
bert = bert.cuda(rank, non_blocking=True)
ja_bert = ja_bert.cuda(rank, non_blocking=True)
en_bert = en_bert.cuda(rank, non_blocking=True)
speakers = speakers.cuda(local_rank, non_blocking=True)
tone = tone.cuda(local_rank, non_blocking=True)
language = language.cuda(local_rank, non_blocking=True)
bert = bert.cuda(local_rank, non_blocking=True)
ja_bert = ja_bert.cuda(local_rank, non_blocking=True)
en_bert = en_bert.cuda(local_rank, non_blocking=True)

with autocast(enabled=hps.train.fp16_run):
(
