[TTS] Implement new TextToSpeech dataset

Signed-off-by: Ryan <rlangman@nvidia.com>
NVIDIA · May 12, 2023 · 2f95fb4 · 2f95fb4
1 parent 232f9de
commit 2f95fb4
Show file tree

Hide file tree

Showing 8 changed files with 742 additions and 59 deletions.
diff --git a/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/examples/tts/conf/fastpitch/fastpitch_22050.yaml
@@ -0,0 +1,220 @@
+# This config contains the default values for training a FastPitch model with an aligner.
+# To train a model on another dataset, change the config values to match your dataset.
+# Most dataset-specific arguments are at the top of the config file; see below.
+
+name: FastPitch  # reused as exp_manager.name via ${name}
+
+max_epochs: ???  # '???' is the OmegaConf marker for a value the user must supply; feeds trainer.max_epochs
+batch_size: 32  # shared by the train and validation dataloader_params
+weighted_sample_steps: null  # forwarded to the training dataset only
+
+n_speakers: ???  # must be supplied; forwarded to model.n_speakers
+speaker_path: null  # forwarded to both datasets
+feature_stats_path: null  # used as stats_path by the pitch and energy processors
+
+train_ds_meta: ???  # must be supplied; becomes train_ds.dataset.dataset_meta
+val_ds_meta: ???  # must be supplied; becomes validation_ds.dataset.dataset_meta
+
+phoneme_dict_path: ???  # must be supplied; used by the tokenizer's g2p section
+heteronyms_path: ???  # must be supplied; used by the tokenizer's g2p section
+
+defaults:
+  - feature: feature_22050  # Hydra defaults entry; its values are referenced below as ${feature.*}
+
+model:  # FastPitch model hyperparameters; the nested sections below configure its components
+  learn_alignment: true
+  bin_loss_warmup_epochs: 100
+
+  n_speakers: ${n_speakers}  # top-level n_speakers, which the user must supply
+  n_mel_channels: ${feature.mel_feature.mel_dim}  # same interpolation as preprocessor.features
+  max_token_duration: 75
+  symbols_embedding_dim: 384  # referenced below as d_model / d_embed / input_size / n_text_channels
+  pitch_embedding_kernel_size: 3
+  energy_embedding_kernel_size: 3
+  speaker_emb_condition_prosody: true
+  speaker_emb_condition_aligner: true
+  use_log_energy: false
+  dur_loss_scale: 0.1
+  pitch_loss_scale: 0.1
+  energy_loss_scale: 0.1
+
+  preprocessor:  # mel-spectrogram preprocessor; sizes and frequency range come from the feature config
+    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
+    features: ${feature.mel_feature.mel_dim}  # same value as model.n_mel_channels
+    lowfreq: ${feature.mel_feature.lowfreq}
+    highfreq: ${feature.mel_feature.highfreq}
+    n_fft: ${feature.win_length}  # FFT size is tied to the window length
+    n_window_size: ${feature.win_length}
+    window_size: false  # NOTE(review): presumably disabled in favour of n_window_size - confirm
+    n_window_stride: ${feature.hop_length}
+    window_stride: false  # NOTE(review): presumably disabled in favour of n_window_stride - confirm
+    pad_to: 1
+    pad_value: 0
+    sample_rate: ${feature.sample_rate}
+    window: hann
+    normalize: null
+    preemph: null
+    dither: 0.0
+    frame_splicing: 1
+    log: true
+    log_zero_guard_type: add
+    log_zero_guard_value: 1.0
+    mag_power: 1.0
+    mel_norm: null
+
+  text_tokenizer:  # IPA tokenizer with a nested g2p component
+    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
+    punct: true
+    apostrophe: true
+    pad_with_space: true
+    g2p:
+      _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
+      phoneme_dict: ${phoneme_dict_path}  # top-level path the user must supply
+      heteronyms: ${heteronyms_path}  # top-level path the user must supply
+      phoneme_probability: 0.8
+      # Relies on the heteronyms list for anything that needs to be disambiguated
+      ignore_ambiguous_words: false
+      use_chars: true
+      use_stresses: true
+
+  pitch_processor:  # referenced by both datasets as ${model.pitch_processor}
+    _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
+    field: pitch
+    stats_path: ${feature_stats_path}  # top-level value; null by default
+
+  energy_processor:  # referenced by both datasets as ${model.energy_processor}
+    _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
+    field: energy
+    stats_path: ${feature_stats_path}  # same stats file as the pitch processor
+
+  align_prior_config:  # referenced by both datasets as ${model.align_prior_config}
+    _target_: nemo.collections.tts.data.text_to_speech_dataset.AlignPriorConfig
+    hop_length: ${feature.hop_length}  # same hop length the preprocessor uses as n_window_stride
+    use_beta_binomial_interpolator: false
+
+  train_ds:  # training data: dataset arguments plus dataloader settings
+    dataset:
+      _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
+      dataset_meta: ${train_ds_meta}  # top-level value the user must supply
+      weighted_sample_steps: ${weighted_sample_steps}  # set for training only; absent from validation_ds
+      sample_rate: ${feature.sample_rate}
+      speaker_path: ${speaker_path}
+      featurizers: ${feature.featurizers}  # pitch and energy entries from the feature config
+      feature_processors:
+        pitch: ${model.pitch_processor}
+        energy: ${model.energy_processor}
+      align_prior_config: ${model.align_prior_config}
+      min_duration: 0.1  # duration bounds are configured for training only
+      max_duration: 10.0
+
+    dataloader_params:
+      batch_size: ${batch_size}  # top-level batch_size
+      drop_last: true
+      num_workers: 8
+
+  validation_ds:  # validation data: same dataset class as train_ds, without sampling or duration settings
+    dataset:
+      _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
+      dataset_meta: ${val_ds_meta}  # top-level value the user must supply
+      sample_rate: ${feature.sample_rate}
+      speaker_path: ${speaker_path}
+      featurizers: ${feature.featurizers}
+      feature_processors:
+        pitch: ${model.pitch_processor}
+        energy: ${model.energy_processor}
+      align_prior_config: ${model.align_prior_config}
+
+    dataloader_params:
+      batch_size: ${batch_size}  # same batch size as training
+      drop_last: false  # train_ds sets this to true
+      num_workers: 2
+
+  input_fft:  # FFTransformerEncoder settings
+    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
+    n_layer: 6
+    n_head: 2  # output_fft uses 1
+    d_model: ${model.symbols_embedding_dim}
+    d_head: 64
+    d_inner: 1536
+    kernel_size: 3
+    dropout: 0.1
+    dropatt: 0.1
+    dropemb: 0.0
+    d_embed: ${model.symbols_embedding_dim}  # same value as d_model
+
+  output_fft:  # FFTransformerDecoder settings; no d_embed key, unlike input_fft
+    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
+    n_layer: 6
+    n_head: 1  # input_fft uses 2
+    d_model: ${model.symbols_embedding_dim}
+    d_head: 64
+    d_inner: 1536
+    kernel_size: 3
+    dropout: 0.1
+    dropatt: 0.1
+    dropemb: 0.0
+
+  alignment_module:  # AlignmentEncoder settings
+    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
+    n_text_channels: ${model.symbols_embedding_dim}  # tied to the shared embedding width
+
+  duration_predictor:  # same TemporalPredictor settings as pitch_predictor and energy_predictor
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}  # tied to the shared embedding width
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+
+  pitch_predictor:  # same TemporalPredictor settings as duration_predictor and energy_predictor
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}  # tied to the shared embedding width
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+
+  energy_predictor:  # same TemporalPredictor settings as duration_predictor and pitch_predictor
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}  # tied to the shared embedding width
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+
+  optim:  # optimizer settings with a nested learning-rate schedule
+    name: adamw
+    lr: 1.0e-3  # decimal point added so YAML 1.1 loaders also read this as a float, not a string
+    betas: [0.9, 0.999]
+    weight_decay: 1.0e-6  # decimal point added for the same reason as lr
+
+    sched:
+      name: NoamAnnealing
+      warmup_steps: 1000
+      last_epoch: -1
+      d_model: 1  # Disable scaling based on model dim
+
+trainer:  # NOTE(review): presumably forwarded to the PyTorch Lightning Trainer - confirm against the training script
+  num_nodes: 1
+  devices: 1
+  accelerator: gpu
+  strategy: ddp
+  precision: 16
+  max_epochs: ${max_epochs}  # top-level value the user must supply
+  accumulate_grad_batches: 1
+  gradient_clip_val: 10.0
+  enable_checkpointing: false # Provided by exp_manager
+  logger: false # Provided by exp_manager
+  log_every_n_steps: 100
+  check_val_every_n_epoch: 10
+  benchmark: false
+
+exp_manager:  # supplies the logger and checkpointing that are disabled in the trainer section
+  exp_dir: null
+  name: ${name}  # top-level name (FastPitch)
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
diff --git a/...ples/tts/conf/features/feature_22050.yaml → examples/tts/conf/feature/feature_22050.yaml b/...ples/tts/conf/features/feature_22050.yaml → examples/tts/conf/feature/feature_22050.yaml
@@ -4,25 +4,25 @@ hop_length: 256
 
 mel_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
-  sample_rate: ${sample_rate}
-  win_length: ${win_length}
-  hop_length: ${hop_length}
+  sample_rate: ${..sample_rate}  # relative interpolation: looked up in the mapping that contains mel_feature
+  win_length: ${..win_length}  # presumably relative so it still resolves when this file is composed under `feature`
+  hop_length: ${..hop_length}
   mel_dim: 80
   lowfreq: 0
   highfreq: 8000
 
 pitch_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
-  sample_rate: ${sample_rate}
-  win_length: ${win_length}
-  hop_length: ${hop_length}
+  sample_rate: ${..sample_rate}  # relative interpolation: looked up in the mapping that contains pitch_feature
+  win_length: ${..win_length}
+  hop_length: ${..hop_length}
   pitch_fmin: 60
   pitch_fmax: 640
 
 energy_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
-  spec_featurizer: ${mel_feature}
+  spec_featurizer: ${..mel_feature}  # relative reference to the mel_feature section of this file
 
 featurizers:
-  pitch: ${pitch_feature}
-  energy: ${energy_feature}
+  pitch: ${..pitch_feature}  # relative reference to the pitch_feature section of this file
+  energy: ${..energy_feature}  # relative reference to the energy_feature section of this file
diff --git a/...ples/tts/conf/features/feature_44100.yaml → examples/tts/conf/feature/feature_44100.yaml b/...ples/tts/conf/features/feature_44100.yaml → examples/tts/conf/feature/feature_44100.yaml
@@ -4,25 +4,25 @@ hop_length: 512
 
 mel_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
-  sample_rate: ${sample_rate}
-  win_length: ${win_length}
-  hop_length: ${hop_length}
+  sample_rate: ${..sample_rate}  # relative interpolation: looked up in the mapping that contains mel_feature
+  win_length: ${..win_length}  # presumably relative so it still resolves when this file is composed under `feature`
+  hop_length: ${..hop_length}
   mel_dim: 80
   lowfreq: 0
   highfreq: null
 
 pitch_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
-  sample_rate: ${sample_rate}
-  win_length: ${win_length}
-  hop_length: ${hop_length}
+  sample_rate: ${..sample_rate}  # relative interpolation: looked up in the mapping that contains pitch_feature
+  win_length: ${..win_length}
+  hop_length: ${..hop_length}
   pitch_fmin: 60
   pitch_fmax: 640
 
 energy_feature:
   _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
-  spec_featurizer: ${mel_feature}
+  spec_featurizer: ${..mel_feature}  # relative reference to the mel_feature section of this file
 
 featurizers:
-  pitch: ${pitch_feature}
-  energy: ${energy_feature}
+  pitch: ${..pitch_feature}  # relative reference to the pitch_feature section of this file
+  energy: ${..energy_feature}  # relative reference to the energy_feature section of this file