Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS] Implement new TextToSpeech dataset #6575

Merged
merged 4 commits into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 220 additions & 0 deletions examples/tts/conf/fastpitch/fastpitch_22050.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
# This config contains the default values for training a FastPitch model with aligner.
# If you want to train a model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.

name: FastPitch

max_epochs: ???
batch_size: 32
weighted_sample_steps: null

n_speakers: ???
speaker_path: null
feature_stats_path: null

train_ds_meta: ???
val_ds_meta: ???

phoneme_dict_path: ???
heteronyms_path: ???

defaults:
- feature: feature_22050

model:
learn_alignment: true
bin_loss_warmup_epochs: 100

n_speakers: ${n_speakers}
n_mel_channels: ${feature.mel_feature.mel_dim}
max_token_duration: 75
symbols_embedding_dim: 384
pitch_embedding_kernel_size: 3
energy_embedding_kernel_size: 3
speaker_emb_condition_prosody: true
speaker_emb_condition_aligner: true
use_log_energy: false
dur_loss_scale: 0.1
pitch_loss_scale: 0.1
energy_loss_scale: 0.1

preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
features: ${feature.mel_feature.mel_dim}
lowfreq: ${feature.mel_feature.lowfreq}
highfreq: ${feature.mel_feature.highfreq}
n_fft: ${feature.win_length}
n_window_size: ${feature.win_length}
window_size: false
n_window_stride: ${feature.hop_length}
window_stride: false
pad_to: 1
pad_value: 0
sample_rate: ${feature.sample_rate}
window: hann
normalize: null
preemph: null
dither: 0.0
frame_splicing: 1
log: true
log_zero_guard_type: add
log_zero_guard_value: 1.0
mag_power: 1.0
mel_norm: null

text_tokenizer:
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
punct: true
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
phoneme_probability: 0.8
# Relies on the heteronyms list for anything that needs to be disambiguated
ignore_ambiguous_words: false
use_chars: true
use_stresses: true

pitch_processor:
_target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
field: pitch
stats_path: ${feature_stats_path}

energy_processor:
_target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
field: energy
stats_path: ${feature_stats_path}

align_prior_config:
_target_: nemo.collections.tts.data.text_to_speech_dataset.AlignPriorConfig
hop_length: ${feature.hop_length}
use_beta_binomial_interpolator: false

train_ds:
dataset:
_target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
dataset_meta: ${train_ds_meta}
weighted_sample_steps: ${weighted_sample_steps}
sample_rate: ${feature.sample_rate}
speaker_path: ${speaker_path}
featurizers: ${feature.featurizers}
feature_processors:
pitch: ${model.pitch_processor}
energy: ${model.energy_processor}
align_prior_config: ${model.align_prior_config}
min_duration: 0.1
max_duration: 10.0

dataloader_params:
batch_size: ${batch_size}
drop_last: true
num_workers: 8

validation_ds:
dataset:
_target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
dataset_meta: ${val_ds_meta}
sample_rate: ${feature.sample_rate}
speaker_path: ${speaker_path}
featurizers: ${feature.featurizers}
feature_processors:
pitch: ${model.pitch_processor}
energy: ${model.energy_processor}
align_prior_config: ${model.align_prior_config}

dataloader_params:
batch_size: ${batch_size}
drop_last: false
num_workers: 2

input_fft:
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
n_layer: 6
n_head: 2
d_model: ${model.symbols_embedding_dim}
d_head: 64
d_inner: 1536
kernel_size: 3
dropout: 0.1
dropatt: 0.1
dropemb: 0.0
d_embed: ${model.symbols_embedding_dim}

output_fft:
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
n_layer: 6
n_head: 1
d_model: ${model.symbols_embedding_dim}
d_head: 64
d_inner: 1536
kernel_size: 3
dropout: 0.1
dropatt: 0.1
dropemb: 0.0

alignment_module:
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
n_text_channels: ${model.symbols_embedding_dim}

duration_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

pitch_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

energy_predictor:
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
input_size: ${model.symbols_embedding_dim}
kernel_size: 3
filter_size: 256
dropout: 0.1
n_layers: 2

optim:
name: adamw
lr: 1e-3
betas: [0.9, 0.999]
weight_decay: 1e-6

sched:
name: NoamAnnealing
warmup_steps: 1000
last_epoch: -1
d_model: 1 # Disable scaling based on model dim

trainer:
num_nodes: 1
devices: 1
accelerator: gpu
strategy: ddp
precision: 16
max_epochs: ${max_epochs}
accumulate_grad_batches: 1
gradient_clip_val: 10.0
enable_checkpointing: false # Provided by exp_manager
logger: false # Provided by exp_manager
log_every_n_steps: 100
check_val_every_n_epoch: 10
benchmark: false

exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
monitor: val_loss
resume_if_exists: false
resume_ignore_no_checkpoint: false
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@ hop_length: 256

mel_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
redoctopus marked this conversation as resolved.
Show resolved Hide resolved
mel_dim: 80
lowfreq: 0
highfreq: 8000

pitch_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
pitch_fmin: 60
pitch_fmax: 640

energy_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
spec_featurizer: ${mel_feature}
spec_featurizer: ${..mel_feature}

featurizers:
pitch: ${pitch_feature}
energy: ${energy_feature}
pitch: ${..pitch_feature}
energy: ${..energy_feature}
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,25 @@ hop_length: 512

mel_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
mel_dim: 80
lowfreq: 0
highfreq: null

pitch_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
sample_rate: ${sample_rate}
win_length: ${win_length}
hop_length: ${hop_length}
sample_rate: ${..sample_rate}
win_length: ${..win_length}
hop_length: ${..hop_length}
pitch_fmin: 60
pitch_fmax: 640

energy_feature:
_target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
spec_featurizer: ${mel_feature}
spec_featurizer: ${..mel_feature}

featurizers:
pitch: ${pitch_feature}
energy: ${energy_feature}
pitch: ${..pitch_feature}
energy: ${..energy_feature}
Loading