Merge pull request espnet#4364 from imdanboy/master
add e2e tts model: JETS
Showing 11 changed files with 3,351 additions and 0 deletions.
# This configuration is for ESPnet2 to train JETS, which
# is a truly end-to-end text-to-waveform model. To run
# this config, you need to specify at least the "--tts_task gan_tts"
# option for tts.sh and use 24000 Hz audio as the training
# data (mainly tested on LJSpeech).
# This configuration was tested on 4 GPUs (V100) with 32 GB GPU
# memory. It takes around 2 weeks to finish the training,
# but a model trained for 100k iterations should already
# generate reasonable results.

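# For illustration, a typical launch command could look like the following,
# where the training config path is hypothetical and should point to wherever
# this file is saved in the recipe:
#
#   ./tts.sh --tts_task gan_tts --train_config conf/tuning/train_jets.yaml
#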
##########################################################
#                    TTS MODEL SETTING                   #
##########################################################
tts: jets
tts_conf:
    # generator related
    generator_type: jets_generator
    generator_params:
        adim: 256  # attention dimension
        aheads: 2  # number of attention heads
        elayers: 4  # number of encoder layers
        eunits: 1024  # number of encoder ff units
        dlayers: 4  # number of decoder layers
        dunits: 1024  # number of decoder ff units
        positionwise_layer_type: conv1d  # type of position-wise layer
        positionwise_conv_kernel_size: 3  # kernel size of position-wise conv layer
        duration_predictor_layers: 2  # number of layers of duration predictor
        duration_predictor_chans: 256  # number of channels of duration predictor
        duration_predictor_kernel_size: 3  # filter size of duration predictor
        use_masking: True  # whether to apply masking for padded part in loss calculation
        encoder_normalize_before: True  # whether to perform layer normalization before the input
        decoder_normalize_before: True  # whether to perform layer normalization before the input
        encoder_type: transformer  # encoder type
        decoder_type: transformer  # decoder type
        conformer_rel_pos_type: latest  # relative positional encoding type
        conformer_pos_enc_layer_type: rel_pos  # conformer positional encoding type
        conformer_self_attn_layer_type: rel_selfattn  # conformer self-attention type
        conformer_activation_type: swish  # conformer activation type
        use_macaron_style_in_conformer: true  # whether to use macaron style in conformer
        use_cnn_in_conformer: true  # whether to use CNN in conformer
        conformer_enc_kernel_size: 7  # kernel size in CNN module of conformer-based encoder
        conformer_dec_kernel_size: 31  # kernel size in CNN module of conformer-based decoder
        init_type: xavier_uniform  # initialization type
        transformer_enc_dropout_rate: 0.2  # dropout rate for transformer encoder layer
        transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
        transformer_enc_attn_dropout_rate: 0.2  # dropout rate for transformer encoder attention layer
        transformer_dec_dropout_rate: 0.2  # dropout rate for transformer decoder layer
        transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
        transformer_dec_attn_dropout_rate: 0.2  # dropout rate for transformer decoder attention layer
        pitch_predictor_layers: 5  # number of conv layers in pitch predictor
        pitch_predictor_chans: 256  # number of channels of conv layers in pitch predictor
        pitch_predictor_kernel_size: 5  # kernel size of conv layers in pitch predictor
        pitch_predictor_dropout: 0.5  # dropout rate in pitch predictor
        pitch_embed_kernel_size: 1  # kernel size of conv embedding layer for pitch
        pitch_embed_dropout: 0.0  # dropout rate after conv embedding layer for pitch
        stop_gradient_from_pitch_predictor: true  # whether to stop the gradient from pitch predictor to encoder
        energy_predictor_layers: 2  # number of conv layers in energy predictor
        energy_predictor_chans: 256  # number of channels of conv layers in energy predictor
        energy_predictor_kernel_size: 3  # kernel size of conv layers in energy predictor
        energy_predictor_dropout: 0.5  # dropout rate in energy predictor
        energy_embed_kernel_size: 1  # kernel size of conv embedding layer for energy
        energy_embed_dropout: 0.0  # dropout rate after conv embedding layer for energy
        stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder
        generator_out_channels: 1
        generator_channels: 512
        generator_global_channels: -1
        generator_kernel_size: 7
        generator_upsample_scales: [8, 8, 2, 2]
        generator_upsample_kernel_sizes: [16, 16, 4, 4]
        generator_resblock_kernel_sizes: [3, 7, 11]
        generator_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        generator_use_additional_convs: true
        generator_bias: true
        generator_nonlinear_activation: "LeakyReLU"
        generator_nonlinear_activation_params:
            negative_slope: 0.1
        generator_use_weight_norm: true
        segment_size: 64  # segment size for random windowed discriminator

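    # Note: the product of generator_upsample_scales (8 * 8 * 2 * 2 = 256)
    # matches the hop_length of 256 used in mel_loss_params below, so each
    # decoder frame is upsampled to 256 waveform samples; with segment_size: 64,
    # the random window fed to the discriminators covers 64 * 256 = 16384 samples.
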
    # discriminator related
    discriminator_type: hifigan_multi_scale_multi_period_discriminator
    discriminator_params:
        scales: 1
        scale_downsample_pooling: "AvgPool1d"
        scale_downsample_pooling_params:
            kernel_size: 4
            stride: 2
            padding: 2
        scale_discriminator_params:
            in_channels: 1
            out_channels: 1
            kernel_sizes: [15, 41, 5, 3]
            channels: 128
            max_downsample_channels: 1024
            max_groups: 16
            bias: True
            downsample_scales: [2, 2, 4, 4, 1]
            nonlinear_activation: "LeakyReLU"
            nonlinear_activation_params:
                negative_slope: 0.1
            use_weight_norm: True
            use_spectral_norm: False
        follow_official_norm: False
        periods: [2, 3, 5, 7, 11]
        period_discriminator_params:
            in_channels: 1
            out_channels: 1
            kernel_sizes: [5, 3]
            channels: 32
            downsample_scales: [3, 3, 3, 3, 1]
            max_downsample_channels: 1024
            bias: True
            nonlinear_activation: "LeakyReLU"
            nonlinear_activation_params:
                negative_slope: 0.1
            use_weight_norm: True
            use_spectral_norm: False

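    # With scales: 1 and periods: [2, 3, 5, 7, 11], this builds one scale
    # discriminator plus five period discriminators (six sub-discriminators
    # in total), following the HiFi-GAN multi-scale/multi-period design.
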
    # loss function related
    generator_adv_loss_params:
        average_by_discriminators: false  # whether to average loss value by #discriminators
        loss_type: mse  # loss type, "mse" or "hinge"
    discriminator_adv_loss_params:
        average_by_discriminators: false  # whether to average loss value by #discriminators
        loss_type: mse  # loss type, "mse" or "hinge"
    feat_match_loss_params:
        average_by_discriminators: false  # whether to average loss value by #discriminators
        average_by_layers: false  # whether to average loss value by #layers of each discriminator
        include_final_outputs: true  # whether to include final outputs for loss calculation
    mel_loss_params:
        fs: 24000  # must be the same as the training data
        n_fft: 1024  # fft points
        hop_length: 256  # hop size
        win_length: null  # window length
        window: hann  # window type
        n_mels: 80  # number of mel bins
        fmin: 0  # minimum frequency of the mel filterbank
        fmax: null  # maximum frequency of the mel filterbank
        log_base: null  # null means natural log
    lambda_adv: 1.0  # loss scaling coefficient for adversarial loss
    lambda_mel: 45.0  # loss scaling coefficient for mel-spectrogram loss
    lambda_feat_match: 2.0  # loss scaling coefficient for feature matching loss
    lambda_var: 1.0  # loss scaling coefficient for variance predictor loss
    lambda_align: 2.0  # loss scaling coefficient for alignment loss
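    # These weights combine the sub-losses into the generator objective, roughly:
    #   L_G = 1.0 * L_adv + 45.0 * L_mel + 2.0 * L_feat_match + 1.0 * L_var + 2.0 * L_align
    # (a summary of the coefficients above, not an exact formula from the code).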
    # others
    sampling_rate: 24000  # needed in the inference for saving wav
    cache_generator_outputs: true  # whether to cache generator outputs in the training

# extra module for additional inputs
pitch_extract: dio  # pitch extractor type
pitch_extract_conf:
    reduction_factor: 1
    use_token_averaged_f0: false
pitch_normalize: global_mvn  # normalizer for the pitch feature
energy_extract: energy  # energy extractor type
energy_extract_conf:
    reduction_factor: 1
    use_token_averaged_energy: false
energy_normalize: global_mvn  # normalizer for the energy feature
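# Note: frame-level (rather than token-averaged) F0 and energy are extracted
# here (use_token_averaged_*: false); JETS learns the token-to-frame alignment
# inside the model, so the extractor only provides frame-level features and
# alignment to tokens is handled internally.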

##########################################################
#              OPTIMIZER & SCHEDULER SETTING             #
##########################################################
# optimizer setting for generator
optim: adamw
optim_conf:
    lr: 2.0e-4
    betas: [0.8, 0.99]
    eps: 1.0e-9
    weight_decay: 0.0
scheduler: exponentiallr
scheduler_conf:
    gamma: 0.999875
# optimizer setting for discriminator
optim2: adamw
optim2_conf:
    lr: 2.0e-4
    betas: [0.8, 0.99]
    eps: 1.0e-9
    weight_decay: 0.0
scheduler2: exponentiallr
scheduler2_conf:
    gamma: 0.999875
generator_first: true  # whether to start updating generator first
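# Note: ExponentialLR multiplies the learning rate by gamma at each scheduler
# step; 0.999875 ** 1000 is roughly 0.88, so if the scheduler is stepped once
# per epoch the learning rate decays to about 88% of 2.0e-4 over the
# 1000-epoch run (how often the scheduler is stepped depends on the trainer).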

##########################################################
#                 OTHER TRAINING SETTING                 #
##########################################################
num_iters_per_epoch: 1000  # number of iterations per epoch
max_epoch: 1000  # number of epochs
accum_grad: 1  # gradient accumulation
batch_bins: 2000000  # batch bins (feats_type=raw)
batch_type: numel  # how to make batch
grad_clip: -1  # gradient clipping norm
grad_noise: false  # whether to use gradient noise injection
sort_in_batch: descending  # how to sort data in making batch
sort_batch: descending  # how to sort created batches
num_workers: 4  # number of workers of data loader
use_amp: false  # whether to use pytorch amp
log_interval: 50  # log interval in iterations
keep_nbest_models: 5  # number of models to keep
num_att_plot: 3  # number of attention figures to be saved at every checkpoint
seed: 777  # random seed number
patience: null  # patience for early stopping
unused_parameters: true  # needed for the multi-GPU case
best_model_criterion:  # criterion to save the best models
-   - valid
    - text2mel_loss
    - min
-   - train
    - text2mel_loss
    - min
-   - train
    - total_count
    - max
cudnn_deterministic: false  # setting to false accelerates the training speed but makes it non-deterministic;
                            # in the case of GAN-TTS training, we strongly recommend setting it to false
cudnn_benchmark: false  # setting to true might accelerate the training speed but sometimes decreases it;
                        # therefore, we set it to false by default (it is worth trying both settings)
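
As a quick illustration of how the optimizer and scheduler settings above map onto plain PyTorch objects, here is a minimal sketch. It is not ESPnet2's trainer code (ESPnet2 constructs these objects internally from the YAML), and the two torch.nn.Linear modules are placeholders standing in for the JETS generator and discriminator.

import torch

# Placeholders standing in for the JETS generator and discriminator modules.
generator = torch.nn.Linear(8, 8)
discriminator = torch.nn.Linear(8, 8)

# Generator side: optim / optim_conf / scheduler / scheduler_conf.
optim_g = torch.optim.AdamW(
    generator.parameters(), lr=2.0e-4, betas=(0.8, 0.99), eps=1.0e-9, weight_decay=0.0
)
sched_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=0.999875)

# Discriminator side: optim2 / optim2_conf / scheduler2 / scheduler2_conf.
optim_d = torch.optim.AdamW(
    discriminator.parameters(), lr=2.0e-4, betas=(0.8, 0.99), eps=1.0e-9, weight_decay=0.0
)
sched_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=0.999875)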