Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS] Fix adapter duration issue #6697

Merged: 9 commits were merged on May 22, 2023 (source and target branch names were not captured in this extract).
3 changes: 3 additions & 0 deletions examples/tts/conf/fastpitch_align_44100_adapter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

model:
unfreeze_aligner: false
unfreeze_duration_predictor: false
unfreeze_pitch_predictor: false
learn_alignment: true
bin_loss_warmup_epochs: 100

Expand Down
12 changes: 12 additions & 0 deletions examples/tts/fastpitch_finetune_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ def main(cfg):
if adapter_global_cfg is not None:
add_global_adapter_cfg(model, adapter_global_cfg)

if cfg.model.get("unfreeze_aligner", False):
for name, param in model.fastpitch.aligner.named_parameters():
param.requires_grad = True

if cfg.model.get("unfreeze_duration_predictor", False):
for name, param in model.fastpitch.duration_predictor.named_parameters():
param.requires_grad = True

if cfg.model.get("unfreeze_pitch_predictor", False):
for name, param in model.fastpitch.pitch_predictor.named_parameters():
param.requires_grad = True

# Add adapters
model.add_adapter(name=adapter_name, cfg=cfg.model.adapter)
assert model.is_adapter_available()
Expand Down
11 changes: 8 additions & 3 deletions nemo/collections/tts/losses/aligner_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@


class ForwardSumLoss(Loss):
def __init__(self, blank_logprob=-1):
def __init__(self, blank_logprob=-1, loss_scale=1.0):
super().__init__()
self.log_softmax = torch.nn.LogSoftmax(dim=-1)
self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
self.blank_logprob = blank_logprob
self.loss_scale = loss_scale

@property
def input_types(self):
Expand Down Expand Up @@ -67,13 +68,15 @@ def forward(self, attn_logprob, in_lens, out_lens):

# Evaluate CTC loss
cost = self.ctc_loss(attn_logprob, target_seqs, input_lengths=query_lens, target_lengths=key_lens)
cost *= self.loss_scale

return cost


class BinLoss(Loss):
def __init__(self):
def __init__(self, loss_scale=1.0):
super().__init__()
self.loss_scale = loss_scale

@property
def input_types(self):
Expand All @@ -91,4 +94,6 @@ def output_types(self):
@typecheck()
def forward(self, hard_attention, soft_attention):
log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], min=1e-12)).sum()
return -log_sum / hard_attention.sum()
loss = -log_sum / hard_attention.sum()
loss *= self.loss_scale
return loss
5 changes: 3 additions & 2 deletions nemo/collections/tts/models/fastpitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):

self.aligner = None
if self.learn_alignment:
aligner_loss_scale = cfg.aligner_loss_scale if "aligner_loss_scale" in cfg else 1.0
self.aligner = instantiate(self._cfg.alignment_module)
self.forward_sum_loss_fn = ForwardSumLoss()
self.bin_loss_fn = BinLoss()
self.forward_sum_loss_fn = ForwardSumLoss(loss_scale=aligner_loss_scale)
[Review thread: github-advanced-security[bot] marked this conversation as resolved (fixed).]
self.bin_loss_fn = BinLoss(loss_scale=aligner_loss_scale)

self.preprocessor = instantiate(self._cfg.preprocessor)
input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
Expand Down
Loading