From f784d5f542b6d417b44e71830edd5a2c01411309 Mon Sep 17 00:00:00 2001 From: kuan chen Date: Tue, 31 Aug 2021 14:06:04 +0800 Subject: [PATCH 1/2] fix params in paper --- README.md | 7 +------ config/default.yaml | 1 + model/generator.py | 4 +++- model/lvcnet.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index def5227..c537574 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,7 @@ python inference.py -p CHECKPOINT_PATH -i INPUT_MEL_PATH You can download checkpoint of pre-trained model from **[Google Drive](https://drive.google.com/file/d/1pZV5q59FfIV4sUp9JobXAUjK4-Y-x2q_/view?usp=sharing)**. The model was trained on LibriTTS train-clean-360 split. +> Please use `config/legacy.yaml` to load the pre-trained model. In the new version, we have fixed the mismatch between the generator's parameters and those of the paper. ## Results @@ -136,12 +137,6 @@ See audio samples at https://mindslab-ai.github.io/univnet/ -## Note - -This code is an unofficial implementation, there may be some differences from the original paper. - -- Our UnivNet generator has smaller number of parameters (c32: 5.11M, c16: 1.42M) than the paper (c32: 14.89M, c16: 4.00M). So far, we have not encountered any issues from using a smaller model size. If run into any problem, please report it as an issue. 
- ## Implementation Authors Implementation authors are: diff --git a/config/default.yaml b/config/default.yaml index d843913..abacd54 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -33,6 +33,7 @@ gen: dilations: [1, 3, 9, 27] strides: [8, 8, 4] lReLU_slope: 0.2 + kpnet_conv_size: 3 ############################# mpd: periods: [2,3,5,7,11] diff --git a/model/generator.py b/model/generator.py index 23d85b1..9e07bc8 100644 --- a/model/generator.py +++ b/model/generator.py @@ -14,6 +14,7 @@ def __init__(self, hp): self.noise_dim = hp.gen.noise_dim self.hop_length = hp.audio.hop_length channel_size = hp.gen.channel_size + kpnet_conv_size = hp.gen.kpnet_conv_size self.res_stack = nn.ModuleList() hop_length = 1 @@ -26,7 +27,8 @@ def __init__(self, hp): stride=stride, dilations=hp.gen.dilations, lReLU_slope=hp.gen.lReLU_slope, - cond_hop_length=hop_length + cond_hop_length=hop_length, + kpnet_conv_size=kpnet_conv_size ) ) diff --git a/model/lvcnet.py b/model/lvcnet.py index be9fc90..01b9168 100644 --- a/model/lvcnet.py +++ b/model/lvcnet.py @@ -107,7 +107,7 @@ def __init__( conv_kernel_size=3, cond_hop_length=256, kpnet_hidden_channels=64, - kpnet_conv_size=1, + kpnet_conv_size=3, kpnet_dropout=0.0, ): super().__init__() From 6926d47f944d1954233fc91e92c1dd17e44871e9 Mon Sep 17 00:00:00 2001 From: kuan chen Date: Tue, 31 Aug 2021 14:09:31 +0800 Subject: [PATCH 2/2] add legacy.yaml --- config/legacy.yaml | 61 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 config/legacy.yaml diff --git a/config/legacy.yaml b/config/legacy.yaml new file mode 100644 index 0000000..30e598d --- /dev/null +++ b/config/legacy.yaml @@ -0,0 +1,61 @@ +data: + train_dir: 'datasets/' # root path of train data (either relative/absolute path is ok) + train_meta: 'metadata/libritts_train_clean_360_audiopath_text_sid_train.txt' # relative path of metadata file from train_dir + val_dir: 'datasets/' # root path of validation data + val_meta: 
'metadata/libritts_train_clean_360_audiopath_text_sid_val.txt' # relative path of metadata file from val_dir +############################# +train: + num_workers: 8 + batch_size: 32 + optimizer: 'adam' + seed: 1234 + adam: + lr: 0.0001 + beta1: 0.5 + beta2: 0.9 + stft_lamb: 2.5 + spk_balanced: False # Using balanced sampling for each speaker +############################# +audio: + n_mel_channels: 100 + segment_length: 16384 # Should be multiple of 256 + pad_short: 2000 + filter_length: 1024 + hop_length: 256 # WARNING: this can't be changed. + win_length: 1024 + sampling_rate: 24000 + mel_fmin: 0.0 + mel_fmax: 12000.0 +############################# +gen: + noise_dim: 64 + channel_size: 32 # 32 or 16 + dilations: [1, 3, 9, 27] + strides: [8, 8, 4] + lReLU_slope: 0.2 + kpnet_conv_size: 1 +############################# +mpd: + periods: [2,3,5,7,11] + kernel_size: 5 + stride: 3 + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +mrd: + resolutions: "[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)]" # (filter_length, hop_length, win_length) + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +dist_config: + dist_backend: "nccl" + dist_url: "tcp://localhost:54321" + world_size: 1 +############################# +log: + summary_interval: 1 + validation_interval: 1 + save_interval: 1 + num_audio: 5 + chkpt_dir: 'chkpt' + log_dir: 'logs'