Merge pull request espnet#4369 from kan-bayashi/minor_fix_jets

chintu619 · May 17, 2022 · e0e0620 · e0e0620
2 parents df053b8 + 2cfbbd3
commit e0e0620
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 84 deletions.
diff --git a/README.md b/README.md
@@ -97,14 +97,15 @@ Demonstration
     - FastSpeech2
     - Conformer FastSpeech & FastSpeech2
     - VITS
+    - JETS
 - Multi-speaker & multi-language extension
     - Pretrained speaker embedding (e.g., X-vector)
     - Speaker ID embedding
     - Language ID embedding
     - Global style token (GST) embedding
     - Mix of the above embeddings
 - End-to-end training
-    - End-to-end text-to-wav model (e.g., VITS)
+    - End-to-end text-to-wav model (e.g., VITS, JETS, etc.)
     - Joint training of text2mel and vocoder
 - Various language support
     - En / Jp / Zn / De / Ru / And more...
@@ -126,7 +127,7 @@ To train the neural vocoder, please check the following repositories:
 
 > **NOTE**:
 > - We are moving on ESPnet2-based development for TTS.
-> - If you are beginner, we recommend using [ESPnet2-TTS](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1).
+> - The use of ESPnet1-TTS is deprecated; please use [ESPnet2-TTS](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1) instead.
 
 ### SE: Speech enhancement (and separation)
 

diff --git a/egs2/TEMPLATE/tts1/README.md b/egs2/TEMPLATE/tts1/README.md
@@ -726,6 +726,7 @@ You can train the following models by changing `*.yaml` config for `--train_conf
 - [FastSpeech2](https://arxiv.org/abs/2006.04558) ([FastPitch](https://arxiv.org/abs/2006.06873))
 - [Conformer](https://arxiv.org/abs/2005.08100)-based FastSpeech / FastSpeech2
 - [VITS](https://arxiv.org/abs/2106.06103)
+- [JETS](https://arxiv.org/abs/2203.16852)
 
 You can find example configs of the above models in [`egs2/ljspeech/tts1/conf/tuning`](../../ljspeech/tts1/conf/tuning).
 
@@ -742,6 +743,11 @@ You can find example configs of the above models in:
 - [`egs2/vctk/tts1/conf/tuning`](../../vctk/tts1/conf/tuning).
 - [`egs2/libritts/tts1/conf/tuning`](../../libritts/tts1/conf/tuning).
 
+We now also support x-vectors from other toolkits.
+Please check the following options.
+
+https://github.com/espnet/espnet/blob/df053b8c13c26fe289fc882751801fd781e9d43e/egs2/TEMPLATE/tts1/tts.sh#L69-L71
+
 ## FAQ
 
 ### ESPnet1 model is compatible with ESPnet2?

diff --git a/test/espnet2/gan_tts/jets/test_jets.py b/test/espnet2/gan_tts/jets/test_jets.py
@@ -188,8 +188,8 @@ def make_jets_loss_args(**kwargs):
 
 # NOTE(kan-bayashi): first forward requires jit compile
 #   so a little bit more time is needed to run. Therefore,
-#   here we extend execution timeout from 2 sec to 5 sec.
-@pytest.mark.execution_timeout(5)
+#   here we extend execution timeout from 2 sec to 8 sec.
+@pytest.mark.execution_timeout(8)
 @pytest.mark.skipif(
     "1.6" in torch.__version__,
     reason="group conv in pytorch 1.6 has an issue. "
@@ -251,46 +251,6 @@ def make_jets_loss_args(**kwargs):
             },
             {},
         ),
-        (
-            {},
-            {
-                "discriminator_type": "hifigan_period_discriminator",
-                "discriminator_params": {
-                    "period": 2,
-                    "in_channels": 1,
-                    "out_channels": 1,
-                    "kernel_sizes": [5, 3],
-                    "channels": 16,
-                    "downsample_scales": [3, 3, 1],
-                    "max_downsample_channels": 32,
-                    "bias": True,
-                    "nonlinear_activation": "LeakyReLU",
-                    "nonlinear_activation_params": {"negative_slope": 0.1},
-                    "use_weight_norm": True,
-                    "use_spectral_norm": False,
-                },
-            },
-            {},
-        ),
-        (
-            {},
-            {
-                "discriminator_type": "hifigan_scale_discriminator",
-                "discriminator_params": {
-                    "in_channels": 1,
-                    "out_channels": 1,
-                    "kernel_sizes": [15, 41, 5, 3],
-                    "channels": 16,
-                    "max_downsample_channels": 32,
-                    "max_groups": 16,
-                    "bias": True,
-                    "downsample_scales": [2, 2, 1],
-                    "nonlinear_activation": "LeakyReLU",
-                    "nonlinear_activation_params": {"negative_slope": 0.1},
-                },
-            },
-            {},
-        ),
         (
             {},
             {},
@@ -442,46 +402,6 @@ def test_jets_is_trainable_and_decodable(gen_dict, dis_dict, loss_dict):
             },
             {},
         ),
-        (
-            {},
-            {
-                "discriminator_type": "hifigan_period_discriminator",
-                "discriminator_params": {
-                    "period": 2,
-                    "in_channels": 1,
-                    "out_channels": 1,
-                    "kernel_sizes": [5, 3],
-                    "channels": 16,
-                    "downsample_scales": [3, 3, 1],
-                    "max_downsample_channels": 32,
-                    "bias": True,
-                    "nonlinear_activation": "LeakyReLU",
-                    "nonlinear_activation_params": {"negative_slope": 0.1},
-                    "use_weight_norm": True,
-                    "use_spectral_norm": False,
-                },
-            },
-            {},
-        ),
-        (
-            {},
-            {
-                "discriminator_type": "hifigan_scale_discriminator",
-                "discriminator_params": {
-                    "in_channels": 1,
-                    "out_channels": 1,
-                    "kernel_sizes": [15, 41, 5, 3],
-                    "channels": 16,
-                    "max_downsample_channels": 32,
-                    "max_groups": 16,
-                    "bias": True,
-                    "downsample_scales": [2, 2, 1],
-                    "nonlinear_activation": "LeakyReLU",
-                    "nonlinear_activation_params": {"negative_slope": 0.1},
-                },
-            },
-            {},
-        ),
         (
             {},
             {},