Commit 7c245bb

fix unused parameter and add keep n models

jerryuhoo committed Oct 29, 2022
1 parent 6ce4134 commit 7c245bb
Showing 4 changed files with 96 additions and 57 deletions.
56 changes: 45 additions & 11 deletions configs/singing_base.json
@@ -1,24 +1,28 @@
{
"train": {
"log_interval": 200,
"eval_interval": 10000,
"eval_interval": 2000,
"seed": 1234,
"epochs": 20000,
"learning_rate": 1e-4,
"betas": [0.8, 0.99],
"betas": [
0.8,
0.99
],
"eps": 1e-9,
"batch_size": 16,
"batch_size": 8,
"fp16_run": false,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
"c_kl": 1.0,
"keep_n_models": 20
},
"data": {
"training_files":"filelists/singing_train.txt",
"validation_files":"filelists/singing_valid.txt",
"training_files": "filelists/singing_train.txt",
"validation_files": "filelists/singing_valid.txt",
"max_wav_value": 32768.0,
"sampling_rate": 16000,
"filter_length": 1024,
@@ -38,11 +38,41 @@
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [8,8,2,2],
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
8,
8,
2,
2
],
"upsample_initial_channel": 384,
"upsample_kernel_sizes": [16,16,4,4],
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"use_spectral_norm": false
}
}
}
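Note: besides reflowing the JSON, this hunk lowers eval_interval from 10000 to 2000 and batch_size from 16 to 8, and adds keep_n_models. Since checkpoints are written once per eval_interval steps (see the train.py hunk below), keep_n_models = 20 bounds the on-disk history to the newest 20 G/D pairs, i.e. the most recent 20 × 2000 = 40,000 training steps.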
23 changes: 8 additions & 15 deletions models.py
@@ -173,7 +173,7 @@ def __init__(
init_size=4000,
)
self.drop = nn.Dropout(p_dropout)
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+ # self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

def forward(self, phone, score, score_dur, slurs, lengths):
x = self.emb_phone(phone)
@@ -205,10 +205,11 @@ def forward(self, phone, score, score_dur, slurs, lengths):
x = x + p
x = self.drop(x)

- stats = self.proj(x) * x_mask
+ # stats = self.proj(x) * x_mask

- m, logs = torch.split(stats, self.out_channels, dim=1)
- return x, m, logs, x_mask
+ # m, logs = torch.split(stats, self.out_channels, dim=1)
+ # return x, m, logs, x_mask
+ return x, x_mask


class ResidualCouplingBlock(nn.Module):
@@ -654,8 +655,6 @@ def __init__(
self.kernel_size = kernel_size
self.p_dropout = p_dropout

- self.emb = nn.Embedding(121, hidden_channels)
-
self.pitch_net = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
@@ -690,8 +689,6 @@ def __init__(
self.kernel_size = kernel_size
self.p_dropout = p_dropout

- self.emb = nn.Embedding(n_vocab, hidden_channels)
-
self.phonemes_predictor = attentions.Encoder(
hidden_channels, filter_channels, n_heads, 2, kernel_size, p_dropout
)
@@ -729,8 +726,6 @@ def __init__(
self.kernel_size = kernel_size
self.p_dropout = p_dropout

- self.emb = nn.Embedding(121, hidden_channels)
-
self.fft_block = attentions.Encoder(
hidden_channels, filter_channels, n_heads, 4, kernel_size, p_dropout
)
@@ -878,9 +873,7 @@ def forward(
y_lengths,
sid=None,
):
- x, m_p, logs_p, x_mask = self.enc_p(
-     phone, score, score_dur, slurs, phone_lengths
- )
+ x, x_mask = self.enc_p(phone, score, score_dur, slurs, phone_lengths)
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
else:
@@ -954,7 +947,7 @@ def forward(
def infer(
self, phone, phone_lengths, score, score_dur, slurs, sid=None, max_len=None
):
- x, m_p, logs_p, x_mask = self.enc_p(
+ x, x_mask = self.enc_p(
phone, score, score_dur, slurs, phone_lengths
)
if self.n_speakers > 0:
@@ -1083,7 +1076,7 @@ def infer(
sid=None,
max_len=None,
):
- x, m_p, logs_p, x_mask = self.enc_p(
+ x, x_mask = self.enc_p(
phone, score, score_dur, slurs, phone_lengths
)
if self.n_speakers > 0:
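The deletions above strip parameters that no longer take part in the forward pass: the proj projection (and the m/logs stats it produced) and three now-unused nn.Embedding layers. Dangling parameters like these are what previously forced find_unused_parameters=True on the DDP wrappers in train.py. A minimal sketch for locating such parameters after a backward pass, mirroring the commented-out loop added to train.py below (the helper name is illustrative, not part of the commit):

    def report_unused_parameters(model):
        # Call after loss.backward(): a parameter that took no part in the
        # forward graph still has grad None, and DDP must otherwise be told
        # about it via find_unused_parameters=True, at a per-step cost.
        for name, param in model.named_parameters():
            if param.requires_grad and param.grad is None:
                print("unused parameter:", name)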
24 changes: 20 additions & 4 deletions train.py
@@ -113,10 +113,10 @@ def run(rank, n_gpus, hps):
betas=hps.train.betas,
eps=hps.train.eps,
)
- net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
- net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
- # net_g = DDP(net_g, device_ids=[rank])
- # net_d = DDP(net_d, device_ids=[rank])
+ # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
+ # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+ net_g = DDP(net_g, device_ids=[rank])
+ net_d = DDP(net_d, device_ids=[rank])

try:
_, _, _, epoch_str = utils.load_checkpoint(
@@ -301,6 +301,11 @@ def train_and_evaluate(
)
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()

+ # for name, param in net_g.named_parameters():
+ #     if param.grad is None:
+ #         print(name)

scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
@@ -402,6 +407,17 @@ def train_and_evaluate(
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
+ keep_num = hps.train.keep_n_models
+ eval_interval = hps.train.eval_interval
+ if global_step / eval_interval >= keep_num:
+     os.remove(os.path.join(
+         hps.model_dir,
+         "G_{}.pth".format(global_step - keep_num * eval_interval),
+     ))
+     os.remove(os.path.join(
+         hps.model_dir,
+         "D_{}.pth".format(global_step - keep_num * eval_interval),
+     ))
global_step += 1

if rank == 0:
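The new block above rotates old checkpoints: once keep_n_models saves exist, each save at global_step deletes the G/D pair written keep_num * eval_interval steps earlier. The same logic as a standalone sketch (a hypothetical helper, not part of the commit):

    import os

    def rotate_checkpoints(model_dir, global_step, eval_interval, keep_num):
        # Checkpoints are written every eval_interval steps as G_{step}.pth
        # and D_{step}.pth; once keep_num pairs exist, drop the oldest one.
        if global_step / eval_interval >= keep_num:
            old_step = global_step - keep_num * eval_interval
            for prefix in ("G", "D"):
                path = os.path.join(model_dir, "{}_{}.pth".format(prefix, old_step))
                if os.path.exists(path):
                    os.remove(path)

With eval_interval = 2000 and keep_n_models = 20, the save at step 42000 would remove G_2000.pth and D_2000.pth, leaving the newest 20 pairs on disk.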
50 changes: 23 additions & 27 deletions vsinging_infer.py
@@ -10,6 +10,7 @@
from models import SynthesizerTrn
from prepare.data_vits import SingInput
from prepare.data_vits import FeatureInput
+ from prepare.phone_map import get_vocab_size


def save_wav(wav, path, rate):
@@ -20,18 +21,23 @@ def save_wav(wav, path, rate):
# define model and load checkpoint
hps = utils.get_hparams_from_file("./configs/singing_base.json")

+ vocab_size = get_vocab_size()

net_g = SynthesizerTrn(
+ vocab_size,
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
- ).cuda()
+ ) # .cuda()

_ = utils.load_checkpoint("./logs/singing_base/G_160000.pth", net_g, None)
_ = utils.load_checkpoint("./logs/singing_base/G_146000.pth", net_g, None)
net_g.eval()
# net_g.remove_weight_norm()

- singInput = SingInput(16000, 256)
- featureInput = FeatureInput("../VISinger_data/wav_dump_16k/", 16000, 256)
+ singInput = SingInput(hps.data.sampling_rate, hps.data.hop_length)
+ featureInput = FeatureInput(
+     "../VISinger_data/wav_dump_16k/", hps.data.sampling_rate, hps.data.hop_length
+ )

# check directory existence
if not os.path.exists("./singing_out"):
@@ -58,46 +64,36 @@ def save_wav(wav, path, rate):
labels_slr,
labels_uvs,
) = singInput.parseInput(message)
- labels_ids = singInput.expandInput(labels_ids, labels_frames)
- labels_uvs = singInput.expandInput(labels_uvs, labels_frames)
- labels_slr = singInput.expandInput(labels_slr, labels_frames)
- scores_ids = singInput.expandInput(scores_ids, labels_frames)
- scores_pit = singInput.scorePitch(scores_ids)
- # elments by elments
- scores_pit_ = scores_pit * labels_uvs
- scores_pit = singInput.smoothPitch(scores_pit_)
-
- fig = plt.figure(figsize=(12, 6))
- plt.plot(scores_pit_.T, "g")
- plt.plot(scores_pit.T, "r")
- plt.savefig(f"./singing_out/{file}_f0_.png", format="png")
- plt.close(fig)

phone = torch.LongTensor(labels_ids)
score = torch.LongTensor(scores_ids)
+ score_dur = torch.LongTensor(scores_dur)
slurs = torch.LongTensor(labels_slr)
- pitch = featureInput.coarse_f0(scores_pit)
- pitch = torch.LongTensor(pitch)

phone_lengths = phone.size()[0]

begin_time = time()
with torch.no_grad():
- phone = phone.cuda().unsqueeze(0)
- score = score.cuda().unsqueeze(0)
- pitch = pitch.cuda().unsqueeze(0)
- slurs = slurs.cuda().unsqueeze(0)
- phone_lengths = torch.LongTensor([phone_lengths]).cuda()
+ # phone = phone.cuda().unsqueeze(0)
+ # score = score.cuda().unsqueeze(0)
+ # pitch = pitch.cuda().unsqueeze(0)
+ # slurs = slurs.cuda().unsqueeze(0)
+ # phone_lengths = torch.LongTensor([phone_lengths]).cuda()
+ phone = phone.unsqueeze(0)
+ score = score.unsqueeze(0)
+ score_dur = score_dur.unsqueeze(0)
+ slurs = slurs.unsqueeze(0)
+ phone_lengths = torch.LongTensor([phone_lengths])
audio = (
- net_g.infer(phone, phone_lengths, score, pitch, slurs)[0][0, 0]
+ net_g.infer(phone, phone_lengths, score, score_dur, slurs)[0][0, 0]
.data.cpu()
.float()
.numpy()
)
end_time = time()
run_time = end_time - begin_time
print("Syth Time (Seconds):", run_time)
- data_len = len(audio) / 16000
+ data_len = len(audio) / hps.data.sampling_rate
print("Wave Time (Seconds):", data_len)
print("Real time Rate (%):", run_time / data_len)
save_wav(audio, f"./singing_out/{file}.wav", hps.data.sampling_rate)
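The inference script now reads its constants from the config (hps.data.sampling_rate, hps.data.hop_length) rather than hardcoding 16000/256, sizes the model via get_vocab_size(), passes score_dur to net_g.infer in place of the removed pitch pipeline, and runs on CPU since the .cuda() calls are commented out. The closing prints report the real-time factor: seconds spent synthesizing per second of audio produced. A small sketch of that bookkeeping as a reusable helper (hypothetical names, not part of the commit):

    from time import time

    def timed_synthesis(synth_fn, sampling_rate):
        # Times synth_fn() and reports the real-time factor: seconds spent
        # synthesizing per second of audio produced (lower is faster).
        begin = time()
        audio = synth_fn()  # expected to return a 1-D array of samples
        run_time = time() - begin
        audio_seconds = len(audio) / sampling_rate
        print("Synth Time (Seconds):", run_time)
        print("Wave Time (Seconds):", audio_seconds)
        print("Real time Rate (%):", run_time / audio_seconds)
        return audio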
