ONNX export for RadTTS #5880

Merged: 74 commits, Feb 10, 2023

Commits (74):
f4ebdb2
Megatron positional encoding alibi fix (#5808) (#5863)
github-actions[bot] Jan 26, 2023
c8b9efb
Fix segmenting for pcla inference (#5849)
jubick1337 Jan 26, 2023
e4c1085
indentation fix (#5861) (#5862)
github-actions[bot] Jan 26, 2023
f5ea3b4
add ambernet to readme (#5872) (#5873)
github-actions[bot] Jan 27, 2023
99de963
Fix wrong label mapping in batch_inference for label_model (#5767) (#…
github-actions[bot] Jan 27, 2023
2b2064d
WAR for https://github.com/pytorch/pytorch/pull/91526
borisfom Jan 28, 2023
6fdd9f3
Fix memory allocation of NeMo Multi-speaker Data Simulator (#5864)
stevehuang52 Jan 27, 2023
19d30f9
RETRO model finetuning (#5800)
yidong72 Jan 28, 2023
255d804
[TTS] GAN-based spectrogram enhancer (#5565)
racoiaws Jan 30, 2023
0ffad21
Optimizing distributed Adam when running with one work queue (#5560)
timmoon10 Jan 30, 2023
3b8b6a5
fix(readme): fix typo (#5883)
jqueguiner Jan 31, 2023
5eafd2f
TTS inference with Heteronym classification model, hc model inference…
ekmb Jan 31, 2023
df6b8af
take out retro doc (#5885) (#5886)
github-actions[bot] Jan 31, 2023
3c02a0e
Add option to disable distributed parameters in distributed Adam opti…
timmoon10 Jan 31, 2023
0f82b5b
[ASR] Separate Audio-to-Text (BPE, Char) dataset construction (#5774)
artbataev Jan 31, 2023
1492174
transformer duration added and IPA config files added
MikyasDesta Jan 18, 2023
68aacb8
inference issue for pace resolved
MikyasDesta Jan 18, 2023
53a4c9c
Latest ONNX develpoments
borisfom Feb 2, 2023
41b9679
Remove MCD_DTW tarball (#5889)
redoctopus Jan 31, 2023
672cbcc
Block large files from being merged into NeMo main (#5898)
SeanNaren Feb 1, 2023
7bf8d9b
Reduce memory usage in getMultiScaleCosAffinityMatrix function (#5876)
gabitza-tech Feb 1, 2023
93e3f3b
set max_steps for lr decay through config (#5780)
anmolgupt Feb 1, 2023
58ee179
Fix transducer and question answering tutorial bugs bugs (#5809) (#5810)
github-actions[bot] Feb 1, 2023
328ef63
update apex install instructions (#5901) (#5902)
github-actions[bot] Feb 1, 2023
feabe97
Hybrid ASR-TTS models (#5659)
artbataev Feb 1, 2023
251e117
Set providers for ORT inference session (#5903)
athitten Feb 1, 2023
033f036
[ASR] Configurable metrics for audio-to-audio + removed experimental …
anteju Feb 1, 2023
f694dbc
Correct doc for RNNT transcribe() function (#5904)
titu1994 Feb 1, 2023
2e3a04c
Add segmentation export to Audacity label file (#5857)
Ca-ressemble-a-du-fake Feb 2, 2023
fa29629
Cross-Lingual objectives (XLM) and multilingual (many-many) support f…
MaximumEntropy Feb 2, 2023
990b2c8
ONNX export working
borisfom Feb 3, 2023
85c2954
Fixing unit test
borisfom Feb 4, 2023
d99ec3d
Update isort to the latest version (#5895)
artbataev Feb 3, 2023
6d49a7b
Pin isort version (#5914)
artbataev Feb 3, 2023
fe6d14f
Moved eval notebook data to aws (#5911)
redoctopus Feb 3, 2023
c99a261
FilterbankFeaturesTA to match FilterbankFeatures (#5913)
msis Feb 3, 2023
dc03d17
fixed missing long_description_content_type (#5909)
XuesongYang Feb 3, 2023
53636d7
added TPMLP for T5-based models (#5840) (#5841)
github-actions[bot] Feb 3, 2023
0eca70a
Fixing 0-size issue and ONNX BS>1 trace
borisfom Feb 4, 2023
971d5be
Fixing code scan alert
borisfom Feb 6, 2023
5268ff0
update container (#5917)
ericharper Feb 4, 2023
7d1ff97
remove conda pynini install (#5921)
ekmb Feb 4, 2023
755131a
Merge release main (#5916)
ericharper Feb 6, 2023
5937f93
Dynamic freezing in Nemo (#5879)
trias702 Feb 6, 2023
01654a2
Fix Windows bug with save_restore_connector (#5919)
trias702 Feb 6, 2023
30cd3b6
add new lannguages to doc (#5939)
yzhang123 Feb 6, 2023
0ba361c
Workarounds for ONNX export with autocast
borisfom Feb 7, 2023
d8bd89a
fix val loss computation in megatron (#5871)
anmolgupt Feb 6, 2023
44637ab
Restoring sigmas
borisfom Feb 7, 2023
d307171
Add core classes and functions for online clustering diarizer part 2 …
tango4j Feb 7, 2023
14a6eb8
Distributed Adam optimizer overlaps param all-gather with forward com…
timmoon10 Feb 7, 2023
b2a8add
[TTS][ZH] added new NGC model cards with polyphone disambiguation. (#…
XuesongYang Feb 7, 2023
62c8415
Moved truncation of context higher up
borisfom Feb 7, 2023
1314234
[TN] bugfix file handler is not closed. (#5955)
XuesongYang Feb 7, 2023
750e2c9
Added unit test for regulate_len. Unscripted sort_tensor for TRT
borisfom Feb 8, 2023
add5f33
Fixed slice
borisfom Feb 8, 2023
8e25ef4
[TTS] deprecate AudioToCharWithPriorAndPitchDataset. (#5959)
XuesongYang Feb 8, 2023
65080af
bugfix: file handlers are not closed. (#5956)
XuesongYang Feb 8, 2023
2c009ca
[TTS][G2P] deprecate add_symbols (#5961)
XuesongYang Feb 8, 2023
cbfbd7b
fix broken link (#5968)
ericharper Feb 8, 2023
f361861
Fix hybridasr bug (#5950) (#5957)
github-actions[bot] Feb 8, 2023
9339a3f
Added list_available_models (#5967)
treacker Feb 9, 2023
3e342e0
Move settings to `pyproject.toml`. Remove deprecated `pytest-runner` …
artbataev Feb 9, 2023
aaa4851
Fix torchaudio installation (#5850)
artbataev Feb 9, 2023
181bea9
Update fastpitch.py (#5969)
blisc Feb 9, 2023
e801915
Review comments
borisfom Feb 9, 2023
1bf8b56
per-micro-batch input loader (#5635)
erhoo82 Feb 9, 2023
368f57e
update container in readme (#5981)
fayejf Feb 10, 2023
8a879c4
Support Alignment Extraction for all RNNT Beam decoding methods (#5925)
titu1994 Feb 10, 2023
c319875
Add AWS SageMaker ASR Examples (#5638)
SeanNaren Feb 10, 2023
ebfad90
Update PUBLICATIONS.md (#5963)
titu1994 Feb 10, 2023
ce6f6af
[G2P] fixed typos and broken import library. (#5978) (#5979)
github-actions[bot] Feb 10, 2023
b939fb4
[G2P] added backward compatibility for english tokenizer and fixed un…
github-actions[bot] Feb 10, 2023
864ab57
Merge branch 'main' into lstm-onnx
blisc Feb 10, 2023
275 changes: 275 additions & 0 deletions examples/tts/conf/rad-tts_dec_ipa.yaml
@@ -0,0 +1,275 @@
name: RadTTS
sample_rate: 22050

train_dataset: ???
validation_datasets: ???
ckpt_path: None
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]

# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
mapping_file_path: ""

model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}

  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased
    whitelist: ${whitelist_path}

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo_text_processing.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: true
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.torch.data.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo_text_processing.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 8
      num_workers: 8
      pin_memory: false

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.torch.data.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: False
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: True
        stresses: True
        chars: True
        space: ' '
        silence: null
        apostrophe: True
        sep: '|'
        add_blank_at: null
        pad_with_space: True
        g2p:
          _target_: "nemo_text_processing.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 8
      pin_memory: false

  optim:
    name: RAdam
    lr: 0.0001
    betas: [0.9, 0.98]
    weight_decay: 0.000001

    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1 # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 6000
    kl_loss_start_iter: 18000
    loss_weights:
      ctc_loss_weight: 0.1
      dur_loss_weight: 1.0
      f0_loss_weight: 1.0
      energy_loss_weight: 1.0
      vpred_loss_weight: 1.0
    unfreeze_modules: "all"

  load_from_checkpoint: False
  init_from_ptl_ckpt: ${ckpt_path}

  modelConfig:
    _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
    n_speakers: 1
    n_speaker_dim: 16
    n_text: 384 #185
    n_text_dim: 512
    n_flows: 8
    n_conv_layers_per_step: 4
    n_mel_channels: 80
    n_hidden: 1024
    mel_encoder_n_hidden: 512
    dummy_speaker_embedding: false
    n_early_size: 2
    n_early_every: 2
    n_group_size: 2
    affine_model: wavenet
    include_modules: "decatnvpred"
    scaling_fn: tanh
    matrix_decomposition: LUS
    learn_alignments: true
    use_context_lstm: true
    context_lstm_norm: spectral
    context_lstm_w_f0_and_energy: true
    text_encoder_lstm_norm: spectral
    n_f0_dims: 1
    n_energy_avg_dims: 1
    use_first_order_features: false
    unvoiced_bias_activation: "relu"
    decoder_use_partial_padding: false
    decoder_use_unvoiced_bias: true
    ap_pred_log_f0: true
    ap_use_unvoiced_bias: true
    ap_use_voiced_embeddings: true
    dur_model_config: null
    f0_model_config: null
    energy_model_config: null
    v_model_config:
      name: dap
      hparams:
        n_speaker_dim: 16
        take_log_of_input: false
        bottleneck_hparams:
          in_dim: 512
          reduction_factor: 16
          norm: weightnorm
          non_linearity: relu
        arch_hparams:
          out_dim: 1
          n_layers: 2
          n_channels: 256
          kernel_size: 3
          p_dropout: 0.5

trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False
  logger: False
  gradient_clip_val: 1
  log_every_n_steps: 100
  check_val_every_n_epoch: 5

exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val/loss_ctc
    mode: min
    filepath: ${export_dir}
    filename: model_checkpoint
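The config above leans heavily on `${...}` interpolation (e.g. `n_mel_channels: ${n_mels}`), which OmegaConf/Hydra resolves against the top-level keys at load time. As a minimal stdlib-only sketch of that idea — `resolve` is a hypothetical helper, not the real OmegaConf code, and it only handles flat top-level references:

```python
import re

_PATTERN = re.compile(r"\$\{(\w+)\}")

def resolve(config: dict) -> dict:
    """Resolve ${key} references against top-level keys (hypothetical helper)."""
    def substitute(value):
        if isinstance(value, dict):
            return {k: substitute(v) for k, v in value.items()}
        if isinstance(value, str):
            m = _PATTERN.fullmatch(value)
            if m:  # whole-string reference: keep the referenced value's type
                return substitute(config[m.group(1)])
            # embedded reference inside a longer string: stringify it
            return _PATTERN.sub(lambda m: str(config[m.group(1)]), value)
        return value
    return substitute(config)

cfg = {"n_mels": 80, "model": {"n_mel_channels": "${n_mels}"}}
print(resolve(cfg)["model"]["n_mel_channels"])  # → 80
```

Note the whole-string case preserves the referenced value's type (an int stays an int), which is why `${n_mels}` can feed numeric parameters throughout the config.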