Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[New Model] HiFiGAN #422

Merged
merged 48 commits into from
Apr 12, 2021
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
4493feb
Add HiFi-GAN v1 generator and discriminator classes
rishikksh20 Feb 13, 2021
7b7c5d6
1) Combine MSD with Multi-Period disc
rishikksh20 Feb 17, 2021
39b5845
1) Add hifigan json files
rishikksh20 Feb 23, 2021
c20a6b1
* Format the model definition
rishikksh20 Feb 24, 2021
1535777
1) Add ExponentialLR
rishikksh20 Feb 24, 2021
ef6ff4e
Add Exponential LR scheduler check
rishikksh20 Mar 3, 2021
b533474
Remove minor bugs and make code trainable
rishikksh20 Mar 3, 2021
e656e8b
Remove select size bug
rishikksh20 Mar 5, 2021
8d4fd79
update hifigan config
erogol Apr 4, 2021
a14d7bc
hifigan config update
erogol Apr 5, 2021
8c9e1c9
hifigan implementation update
erogol Apr 5, 2021
d57f416
small fixes
erogol Apr 5, 2021
f0e76ee
initial models.json entry for universal hifigan
erogol Apr 6, 2021
e0e3b12
pass all parameters explicitly to _istft
erogol Apr 6, 2021
4a5b1d4
update hifigan config
erogol Apr 6, 2021
ff07c5f
update TorchSTFT to enable melspec
erogol Apr 6, 2021
de3a04f
some commenting for Generator loss and check if the argument is define…
erogol Apr 6, 2021
241e968
load_checkpoint for hifigan and no_grad for inference
erogol Apr 6, 2021
67f8248
placeholder for finetuned sam hifigan model
erogol Apr 7, 2021
57f6bd1
make using different samples for G and D networks optional
erogol Apr 7, 2021
7726dfc
change the upper bound in sound normalization
erogol Apr 7, 2021
bd7a1c1
fix #419
erogol Apr 7, 2021
d95b145
Linter fixes and docstrings for HiFiGAN
erogol Apr 7, 2021
cf44624
more docstring
erogol Apr 7, 2021
02bc776
prevent grad in TorchSTFT
erogol Apr 7, 2021
13dca6e
revert some of Hifigan generator updates
erogol Apr 7, 2021
7cecd2f
add hifigan D
erogol Apr 7, 2021
2a872c9
don't call os.exit as it leaves the process resources standing
erogol Apr 7, 2021
3fb78c0
move scheduler updates to the end of the epoch
erogol Apr 8, 2021
8daf407
cache empty
erogol Apr 8, 2021
4998ece
allow configuration of optimizers from the config file
erogol Apr 8, 2021
6ee211c
remove stft params causing warning
erogol Apr 8, 2021
aee24b0
set different seed in gan_dataset when it is multi-workers
erogol Apr 8, 2021
15f362d
formatting
erogol Apr 8, 2021
773f1db
refactor HifiGAN discriminator
erogol Apr 8, 2021
0ee0458
remove redundant imports
erogol Apr 8, 2021
ba80e82
update gan_datasets tests
erogol Apr 8, 2021
3f0993a
remove junk
erogol Apr 8, 2021
006b1d3
bug fix
erogol Apr 8, 2021
a7fb498
update test config
erogol Apr 8, 2021
53f5489
small fixes
erogol Apr 8, 2021
4d3e1e9
linter fix
erogol Apr 8, 2021
cd69da4
linter fixes #2
erogol Apr 8, 2021
105e0b4
vocoder gan training fixes
erogol Apr 9, 2021
2b529f6
update default hifigan config
erogol Apr 9, 2021
2c71c6d
[ci skip]update gan vocoder configs to reflect the recent changes
erogol Apr 9, 2021
5b70da2
restore schedulers only if training is continuing a previous training
erogol Apr 9, 2021
d295d5d
remove torch.no_grad from TorchSTFT
erogol Apr 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact":"egolge@coqui.com"
},
"hifigan":{
"github_rls_url": "",
"commit": "4132240",
"author": "",
"license": "MIT",
"contact":"egolge@coqui.com"
}
}
},
Expand All @@ -141,6 +148,15 @@
"license": "MPL",
"contact":"egolge@coqui.com"
}
},
"sam":{
"hifigan":{
"github_rls_url": "",
"commit": "",
"author": "Eren Gölge @erogol",
"license": "Apache 2.0",
"contact":"egolge@coqui.com"
}
}
},
"nl":{
Expand Down
100 changes: 55 additions & 45 deletions TTS/bin/train_vocoder_gan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
from inspect import signature

import torch
# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from TTS.utils.arguments import parse_arguments, process_args
from TTS.utils.audio import AudioProcessor
from TTS.utils.distribute import init_distributed
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
remove_experiment_folder, set_init_dict)

from TTS.utils.radam import RAdam

from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
Expand All @@ -25,11 +26,6 @@
setup_generator)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint

# DISTRIBUTED
from torch.nn.parallel import DistributedDataParallel as DDP_th
from torch.utils.data.distributed import DistributedSampler
from TTS.utils.distribute import init_distributed

use_cuda, num_gpus = setup_torch_training_env(True, True)


Expand All @@ -42,6 +38,7 @@ def setup_loader(ap, is_val=False, verbose=False):
hop_len=ap.hop_length,
pad_short=c.pad_short,
conv_pad=c.conv_pad,
return_pairs=c.diff_samples_for_G_and_D if 'diff_samples_for_G_and_D' in c else False,
is_training=not is_val,
return_segments=not is_val,
use_noise_augment=c.use_noise_augment,
Expand All @@ -62,25 +59,19 @@ def setup_loader(ap, is_val=False, verbose=False):

def format_data(data):
if isinstance(data[0], list):
# setup input data
c_G, x_G = data[0]
c_D, x_D = data[1]

# dispatch data to GPU
x_G, y_G = data[0]
x_D, y_D = data[1]
if use_cuda:
c_G = c_G.cuda(non_blocking=True)
x_G = x_G.cuda(non_blocking=True)
c_D = c_D.cuda(non_blocking=True)
y_G = y_G.cuda(non_blocking=True)
x_D = x_D.cuda(non_blocking=True)

return c_G, x_G, c_D, x_D

# return a whole audio segment
co, x = data
y_D = y_D.cuda(non_blocking=True)
return x_G, y_G, x_D, y_D
x, y = data
if use_cuda:
co = co.cuda(non_blocking=True)
x = x.cuda(non_blocking=True)
return co, x, None, None
y = y.cuda(non_blocking=True)
return x, y, None, None


def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
Expand Down Expand Up @@ -143,13 +134,20 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
if D_out_real is None:
feats_real = None
else:
# we don't need scores for real samples for training G since they are always 1
_, feats_real = D_out_real
else:
scores_fake = D_out_fake

# compute losses
loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
feats_real, y_hat_sub, y_G_sub)
loss_G_dict = criterion_G(y_hat=y_hat,
y=y_G,
scores_fake=scores_fake,
feats_fake=feats_fake,
feats_real=feats_real,
y_hat_sub=y_hat_sub,
y_sub=y_G_sub)

loss_G = loss_G_dict['G_loss']

# optimizer generator
Expand All @@ -159,8 +157,6 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
torch.nn.utils.clip_grad_norm_(model_G.parameters(),
c.gen_clip_grad)
optimizer_G.step()
if scheduler_G is not None:
scheduler_G.step()

loss_dict = dict()
for key, value in loss_G_dict.items():
Expand All @@ -174,29 +170,37 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
##############################
if global_step >= c.steps_to_start_discriminator:
# discriminator pass
with torch.no_grad():
y_hat = model_G(c_D)
if c.diff_samples_for_G_and_D:
# use a different sample than generator
with torch.no_grad():
y_hat = model_G(c_D)

# PQMF formatting
if y_hat.shape[1] > 1:
y_hat = model_G.pqmf_synthesis(y_hat)
# PQMF formatting
if y_hat.shape[1] > 1:
y_hat = model_G.pqmf_synthesis(y_hat)
else:
# use the same samples as generator
c_D = c_G.clone()
y_D = y_G.clone()

# run D with or without cond. features
if len(signature(model_D.forward).parameters) == 2:
D_out_fake = model_D(y_hat.detach(), c_D)
D_out_fake = model_D(y_hat.detach().clone(), c_D)
D_out_real = model_D(y_D, c_D)
else:
D_out_fake = model_D(y_hat.detach())
D_out_real = model_D(y_D)

# format D outputs
if isinstance(D_out_fake, tuple):
# model_D returns scores and features
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
scores_real, feats_real = None, None
else:
scores_real, feats_real = D_out_real
else:
# model D returns only scores
scores_fake = D_out_fake
scores_real = D_out_real

Expand All @@ -211,8 +215,6 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
torch.nn.utils.clip_grad_norm_(model_D.parameters(),
c.disc_clip_grad)
optimizer_D.step()
if scheduler_D is not None:
scheduler_D.step()

for key, value in loss_D_dict.items():
if isinstance(value, (int, float)):
Expand Down Expand Up @@ -284,6 +286,12 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
c.audio["sample_rate"])
end_time = time.time()

if scheduler_G is not None:
scheduler_G.step()

if scheduler_D is not None:
scheduler_D.step()

# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)

Expand All @@ -295,6 +303,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
# TODO: plot model stats
# if c.tb_model_param_stats:
# tb_logger.tb_model_weights(model, global_step)
torch.cuda.empty_cache()
return keep_avg.avg_values, global_step


Expand All @@ -321,7 +330,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
##############################

# generator pass
y_hat = model_G(c_G)
y_hat = model_G(c_G)[:, :, :y_G.size(2)]
y_hat_sub = None
y_G_sub = None

Expand Down Expand Up @@ -373,7 +382,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
if global_step >= c.steps_to_start_discriminator:
# discriminator pass
with torch.no_grad():
y_hat = model_G(c_G)
y_hat = model_G(c_G)[:, :, :y_G.size(2)]

# PQMF formatting
if y_hat.shape[1] > 1:
Expand Down Expand Up @@ -436,7 +445,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)

# synthesize a full voice
data_loader.return_segments = False

torch.cuda.empty_cache()
return keep_avg.avg_values


Expand Down Expand Up @@ -464,10 +473,13 @@ def main(args): # pylint: disable=redefined-outer-name
model_disc = setup_discriminator(c)

# setup optimizers
optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
optimizer_disc = RAdam(model_disc.parameters(),
lr=c.lr_disc,
weight_decay=0)
# TODO: allow loading custom optimizers
optimizer_gen = None
optimizer_disc = None
optimizer_gen = getattr(torch.optim, c.optimizer)
optimizer_gen = optimizer_gen(lr=c.lr_gen, **c.optimizer_params)
optimizer_disc = getattr(torch.optim, c.optimizer)
optimizer_disc= optimizer_disc(lr=c.lr_gen, **c.optimizer_params)

# schedulers
scheduler_gen = None
Expand Down Expand Up @@ -506,6 +518,8 @@ def main(args): # pylint: disable=redefined-outer-name
print(" > Restoring Discriminator LR Scheduler...")
scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
scheduler_disc.optimizer = optimizer_disc
if c.lr_scheduler_disc == "ExponentialLR":
scheduler_disc.last_epoch = checkpoint['epoch']
except RuntimeError:
# restore only matching layers.
print(" > Partial model initialization...")
Expand Down Expand Up @@ -597,10 +611,6 @@ def main(args): # pylint: disable=redefined-outer-name
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
Expand Down
30 changes: 17 additions & 13 deletions TTS/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,22 @@ def out_linear_to_mel(self, linear_spec):

### STFT and ISTFT ###
def _stft(self, y):
return librosa.stft(
y=y,
n_fft=self.fft_size,
hop_length=self.hop_length,
win_length=self.win_length,
pad_mode=self.stft_pad_mode,
)
return librosa.stft(y=y,
n_fft=self.fft_size,
hop_length=self.hop_length,
win_length=self.win_length,
pad_mode=self.stft_pad_mode,
window='hann',
center=True,
)

def _istft(self, y):
return librosa.istft(
y, hop_length=self.hop_length, win_length=self.win_length)
return librosa.istft(y,
hop_length=self.hop_length,
win_length=self.win_length,
window='hann',
center=True,
)

def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
Expand Down Expand Up @@ -367,7 +372,7 @@ def trim_silence(self, wav):

@staticmethod
def sound_norm(x):
return x / abs(x).max() * 0.9
return x / abs(x).max() * 0.95

### save and load ###
def load_wav(self, filename, sr=None):
Expand All @@ -387,10 +392,9 @@ def load_wav(self, filename, sr=None):
x = self.sound_norm(x)
return x

def save_wav(self, wav, path, sample_rate=None):
sample_rate = self.sample_rate if sample_rate is None else sample_rate
def save_wav(self, wav, path, sr=None):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16))
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))

@staticmethod
def mulaw_encode(wav, qc):
Expand Down
Loading