Merge pull request #3 from azraelkuan/master

fix params for kpnet
maum-ai · Aug 31, 2021 · ebee241 · ebee241
2 parents df77c9a + 6926d47
commit ebee241
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -129,19 +129,14 @@ python inference.py -p CHECKPOINT_PATH -i INPUT_MEL_PATH
 
 You can download checkpoint of pre-trained model from **[Google Drive](https://drive.google.com/file/d/1pZV5q59FfIV4sUp9JobXAUjK4-Y-x2q_/view?usp=sharing)**.
 The model was trained on LibriTTS train-clean-360 split.
+> Please use `config/legacy.yaml` to load the pre-trained model. In the new version, the problem of mismatched parameters has been fixed.
 
 ## Results
 
 See audio samples at https://mindslab-ai.github.io/univnet/
 
 <img src="docs/loss.png" width="100%">
 
-## Note
-
-This code is an unofficial implementation, there may be some differences from the original paper.
-
-- Our UnivNet generator has smaller number of parameters (c32: 5.11M, c16: 1.42M) than the paper (c32: 14.89M, c16: 4.00M). So far, we have not encountered any issues from using a smaller model size. If run into any problem, please report it as an issue.
-
 ## Implementation Authors
 
 Implementation authors are:

diff --git a/config/default.yaml b/config/default.yaml
@@ -33,6 +33,7 @@ gen:
   dilations: [1, 3, 9, 27]
   strides: [8, 8, 4]
   lReLU_slope: 0.2
+  kpnet_conv_size: 3
 #############################
 mpd:
   periods: [2,3,5,7,11]

diff --git a/config/legacy.yaml b/config/legacy.yaml
@@ -0,0 +1,61 @@
+data:
+  train_dir: 'datasets/' # root path of train data (either a relative or an absolute path is ok)
+  train_meta: 'metadata/libritts_train_clean_360_audiopath_text_sid_train.txt'  # relative path of metadata file from train_dir
+  val_dir: 'datasets/' # root path of validation data
+  val_meta: 'metadata/libritts_train_clean_360_audiopath_text_sid_val.txt'  # relative path of metadata file from val_dir
+#############################
+train:
+  num_workers: 8
+  batch_size: 32
+  optimizer: 'adam'
+  seed: 1234
+  adam:
+    lr: 0.0001
+    beta1: 0.5
+    beta2: 0.9
+  stft_lamb: 2.5
+  spk_balanced: False # Using balanced sampling for each speaker
+#############################
+audio:
+  n_mel_channels: 100
+  segment_length: 16384 # Should be multiple of 256
+  pad_short: 2000
+  filter_length: 1024
+  hop_length: 256 # WARNING: this can't be changed.
+  win_length: 1024
+  sampling_rate: 24000
+  mel_fmin: 0.0
+  mel_fmax: 12000.0
+#############################
+gen:
+  noise_dim: 64
+  channel_size: 32 # 32 or 16
+  dilations: [1, 3, 9, 27]
+  strides: [8, 8, 4]
+  lReLU_slope: 0.2
+  kpnet_conv_size: 1
+#############################
+mpd:
+  periods: [2,3,5,7,11]
+  kernel_size: 5
+  stride: 3
+  use_spectral_norm: False
+  lReLU_slope: 0.2
+#############################
+mrd:
+  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)]" # (filter_length, hop_length, win_length)
+  use_spectral_norm: False
+  lReLU_slope: 0.2
+#############################
+dist_config:
+  dist_backend: "nccl"
+  dist_url: "tcp://localhost:54321"
+  world_size: 1
+#############################
+log:
+  summary_interval: 1
+  validation_interval: 1
+  save_interval: 1
+  num_audio: 5
+  chkpt_dir: 'chkpt'
+  log_dir: 'logs'
diff --git a/model/generator.py b/model/generator.py
@@ -14,6 +14,7 @@ def __init__(self, hp):
         self.noise_dim = hp.gen.noise_dim
         self.hop_length = hp.audio.hop_length
         channel_size = hp.gen.channel_size
+        kpnet_conv_size = hp.gen.kpnet_conv_size
 
         self.res_stack = nn.ModuleList()
         hop_length = 1
@@ -26,7 +27,8 @@ def __init__(self, hp):
                     stride=stride,
                     dilations=hp.gen.dilations,
                     lReLU_slope=hp.gen.lReLU_slope,
-                    cond_hop_length=hop_length
+                    cond_hop_length=hop_length,
+                    kpnet_conv_size=kpnet_conv_size
                 )
             )
 

diff --git a/model/lvcnet.py b/model/lvcnet.py
@@ -107,7 +107,7 @@ def __init__(
             conv_kernel_size=3,
             cond_hop_length=256,
             kpnet_hidden_channels=64,
-            kpnet_conv_size=1,
+            kpnet_conv_size=3,
             kpnet_dropout=0.0,
         ):
         super().__init__()