Skip to content

Commit

Permalink
[TTS] Implement new TextToSpeech dataset
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan <rlangman@nvidia.com>
  • Loading branch information
rlangman committed May 12, 2023
1 parent 232f9de commit 2f95fb4
Show file tree
Hide file tree
Showing 8 changed files with 742 additions and 59 deletions.
220 changes: 220 additions & 0 deletions examples/tts/conf/fastpitch/fastpitch_22050.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
# This config contains the default values for training a FastPitch model with aligner.
# If you want to train a model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.

name: FastPitch

max_epochs: ???
batch_size: 32
weighted_sample_steps: null

n_speakers: ???
speaker_path: null
feature_stats_path: null

train_ds_meta: ???
val_ds_meta: ???

phoneme_dict_path: ???
heteronyms_path: ???

defaults:
- feature: feature_22050

model:
learn_alignment: true
bin_loss_warmup_epochs: 100

n_speakers: ${n_speakers}
n_mel_channels: ${feature.mel_feature.mel_dim}
max_token_duration: 75
symbols_embedding_dim: 384
pitch_embedding_kernel_size: 3
energy_embedding_kernel_size: 3
speaker_emb_condition_prosody: true
speaker_emb_condition_aligner: true
use_log_energy: false
dur_loss_scale: 0.1
pitch_loss_scale: 0.1
energy_loss_scale: 0.1

preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
features: ${feature.mel_feature.mel_dim}
lowfreq: ${feature.mel_feature.lowfreq}
highfreq: ${feature.mel_feature.highfreq}
n_fft: ${feature.win_length}
n_window_size: ${feature.win_length}
window_size: false
n_window_stride: ${feature.hop_length}
window_stride: false
pad_to: 1
pad_value: 0
sample_rate: ${feature.sample_rate}
window: hann
normalize: null
preemph: null
dither: 0.0
frame_splicing: 1
log: true
log_zero_guard_type: add
log_zero_guard_value: 1.0
mag_power: 1.0
mel_norm: null

text_tokenizer:
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
punct: true
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
phoneme_probability: 0.8
# Relies on the heteronyms list for anything that needs to be disambiguated
ignore_ambiguous_words: false
use_chars: true
use_stresses: true

pitch_processor:
_target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
field: pitch
stats_path: ${feature_stats_path}

energy_processor:
_target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
field: energy
stats_path: ${feature_stats_path}

align_prior_config:
_target_: nemo.collections.tts.data.text_to_speech_dataset.AlignPriorConfig
hop_length: ${feature.hop_length}
use_beta_binomial_interpolator: false

train_ds:
dataset:
_target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
dataset_meta: ${train_ds_meta}
weighted_sample_steps: ${weighted_sample_steps}
sample_rate: ${feature.sample_rate}
speaker_path: ${speaker_path}
featurizers: ${feature.featurizers}
feature_processors:
pitch: ${model.pitch_processor}
energy: ${model.energy_processor}
align_prior_config: ${model.align_prior_config}
min_duration: 0.1
max_duration: 10.0

dataloader_params:
batch_size: ${batch_size}
drop_last: true
num_workers: 8

validation_ds:
dataset:
_target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
dataset_meta: ${val_ds_meta}
sample_rate: ${feature.sample_rate}
speaker_path: ${speaker_path}
featurizers: ${feature.featurizers}
feature_processors:
pitch: ${model.pitch_processor}
energy: ${model.energy_processor}
align_prior_config: ${model.align_prior_config}

dataloader_params:
batch_size: ${batch_size}
drop_last: false
num_workers: 2

input_fft:
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
n_layer: 6
n_head: 2
d_model: ${model.symbols_embedding_dim}
d_head: 64
d_inner: 1536
kernel_size: 3
dropout: 0.1
dropatt: 0.1
dropemb: 0.0
d_embed: ${model.symbols_embedding_dim}

output_fft:
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
n_layer: 6
n_head: 1
d_model: ${model.symbols_embedding_dim}
d_head: 64
d_inner: 1536
kernel_size: 3
dropout: 0.1
dropatt: 0.1
dropemb: 0.0

alignment_module:
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
n_text_channels: ${model.symbols_embedding_dim}

duration_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

pitch_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

energy_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

optim:
name: adamw
lr: 1e-3
betas: [0.9, 0.999]
weight_decay: 1e-6

sched:
name: NoamAnnealing
warmup_steps: 1000
last_epoch: -1
d_model: 1 # Disable scaling based on model dim

trainer:
num_nodes: 1
devices: 1
accelerator: gpu
strategy: ddp
precision: 16
max_epochs: ${max_epochs}
accumulate_grad_batches: 1
gradient_clip_val: 10.0
enable_checkpointing: false # Provided by exp_manager
logger: false # Provided by exp_manager
log_every_n_steps: 100
check_val_every_n_epoch: 10
benchmark: false

exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
monitor: val_loss
resume_if_exists: false
resume_ignore_no_checkpoint: false
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@ hop_length: 256

mel_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
mel_dim: 80
lowfreq: 0
highfreq: 8000

pitch_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
pitch_fmin: 60
pitch_fmax: 640

energy_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
spec_featurizer: ${mel_feature}
spec_featurizer: ${..mel_feature}

featurizers:
pitch: ${pitch_feature}
energy: ${energy_feature}
pitch: ${..pitch_feature}
energy: ${..energy_feature}
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@ hop_length: 512

mel_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
mel_dim: 80
lowfreq: 0
highfreq: null

pitch_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
pitch_fmin: 60
pitch_fmax: 640

energy_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
spec_featurizer: ${mel_feature}
spec_featurizer: ${..mel_feature}

featurizers:
pitch: ${pitch_feature}
energy: ${energy_feature}
pitch: ${..pitch_feature}
energy: ${..energy_feature}
Loading

0 comments on commit 2f95fb4

Please sign in to comment.