coqui-ai · erogol · Jul 12, 2022 · May 3, 2022 · May 3, 2022 · May 3, 2022
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -11,4 +11,5 @@ recursive-include TTS *.md
 recursive-include TTS *.py
 recursive-include TTS *.pyx
 recursive-include images *.png
-
+recursive-exclude tests *
+prune tests*
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
@@ -2,7 +2,7 @@
 from typing import List
 
 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.models.vits import VitsArgs
+from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
 
 
 @dataclass
@@ -16,6 +16,9 @@ class VitsConfig(BaseTTSConfig):
  model_args (VitsArgs):
  Model architecture arguments. Defaults to `VitsArgs()`.
 
+ audio (VitsAudioConfig):
+ Audio processing configuration. Defaults to `VitsAudioConfig()`.
+
  grad_clip (List):
  Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
 
@@ -94,6 +97,7 @@ class VitsConfig(BaseTTSConfig):
  model: str = "vits"
  # model specific params
  model_args: VitsArgs = field(default_factory=VitsArgs)
+ audio: VitsAudioConfig = VitsAudioConfig()
 
  # optimizer
  grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])

diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
@@ -137,7 +137,7 @@ def forward(self, y_hat, y, length):
 
  if ssim_loss.item() < 0.0:
  print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0")
- ssim_loss =  torch.tensor([0.0])
+ ssim_loss = torch.tensor([0.0])
 
  return ssim_loss
 

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
@@ -200,6 +200,22 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm
  return spec
 
 
+#############################
+# CONFIGS
+#############################
+
+
+@dataclass
+class VitsAudioConfig(Coqpit):
+ fft_size: int = 1024
+ sample_rate: int = 22050
+ win_length: int = 1024
+ hop_length: int = 256
+ num_mels: int = 80
+ mel_fmin: int = 0
+ mel_fmax: int = None
+
+
 ##############################
 # DATASET
 ##############################

diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py
@@ -16,9 +16,9 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor:
  """
  if reduction == "none":
  return x
- elif reduction == "mean":
+ if reduction == "mean":
  return x.mean(dim=0)
- elif reduction == "sum":
+ if reduction == "sum":
  return x.sum(dim=0)
  raise ValueError("Unknown reduction. Expected one of {'none', 'mean', 'sum'}")
 

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
@@ -307,7 +307,7 @@ def tts(
  waveform = waveform.squeeze()
 
  # trim silence
- if self.tts_config.audio["do_trim_silence"] is True:
+ if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]:
  waveform = trim_silence(waveform, self.tts_model.ap)
 
  wavs += list(waveform)

diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
@@ -54,7 +54,6 @@
  print_step=50,
  print_eval=False,
  mixed_precision=False,
- sort_by_audio_len=True,
  max_seq_len=500000,
  output_path=output_path,
  datasets=[dataset_config],

diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py
@@ -53,7 +53,6 @@
  print_step=50,
  print_eval=False,
  mixed_precision=False,
- sort_by_audio_len=True,
  max_seq_len=500000,
  output_path=output_path,
  datasets=[dataset_config],

diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py
@@ -46,7 +46,6 @@
  print_step=50,
  print_eval=False,
  mixed_precision=False,
- sort_by_audio_len=True,
  max_seq_len=500000,
  output_path=output_path,
  datasets=[dataset_config],

diff --git a/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py b/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py
@@ -68,7 +68,6 @@
  print_step=25,
  print_eval=True,
  mixed_precision=False,
- sort_by_audio_len=True,
  seq_len_norm=True,
  output_path=output_path,
  datasets=[dataset_config],

diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py
@@ -2,33 +2,19 @@
 
 from trainer import Trainer, TrainerArgs
 
-from TTS.config.shared_configs import BaseAudioConfig
 from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.vits import Vits
+from TTS.tts.models.vits import Vits, VitsAudioConfig
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
  name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )
-audio_config = BaseAudioConfig(
- sample_rate=22050,
- win_length=1024,
- hop_length=256,
- num_mels=80,
- preemphasis=0.0,
- ref_level_db=20,
- log_func="np.log",
- do_trim_silence=True,
- trim_db=45,
- mel_fmin=0,
- mel_fmax=None,
- spec_gain=1.0,
- signal_norm=False,
- do_amp_to_db_linear=False,
+audio_config = VitsAudioConfig(
+ sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
 )
 
 config = VitsConfig(
@@ -37,7 +23,7 @@
  batch_size=32,
  eval_batch_size=16,
  batch_group_size=5,
- num_loader_workers=0,
+ num_loader_workers=8,
  num_eval_loader_workers=4,
  run_eval=True,
  test_delay_epochs=-1,
@@ -52,6 +38,7 @@
  mixed_precision=True,
  output_path=output_path,
  datasets=[dataset_config],
+ cudnn_benchmark=False,
 )
 
 # INITIALIZE THE AUDIO PROCESSOR

diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py
@@ -3,11 +3,10 @@
 
 from trainer import Trainer, TrainerArgs
 
-from TTS.config.shared_configs import BaseAudioConfig
 from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
@@ -22,22 +21,13 @@
  for path in dataset_paths
 ]
 
-audio_config = BaseAudioConfig(
+audio_config = VitsAudioConfig(
  sample_rate=16000,
  win_length=1024,
  hop_length=256,
  num_mels=80,
- preemphasis=0.0,
- ref_level_db=20,
- log_func="np.log",
- do_trim_silence=False,
- trim_db=23.0,
  mel_fmin=0,
  mel_fmax=None,
- spec_gain=1.0,
- signal_norm=True,
- do_amp_to_db_linear=False,
- resample=False,
 )
 
 vitsArgs = VitsArgs(
@@ -69,7 +59,6 @@
  use_language_weighted_sampler=True,
  print_eval=False,
  mixed_precision=False,
- sort_by_audio_len=True,
  min_audio_len=32 * 256 * 4,
  max_audio_len=160000,
  output_path=output_path,

diff --git a/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py b/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py
@@ -60,7 +60,6 @@
  "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
  "Vor dem 22. November 1963.",
  ],
- sort_by_audio_len=True,
  max_seq_len=500000,
  output_path=output_path,
  datasets=[dataset_config],

diff --git a/recipes/thorsten_DE/vits_tts/train_vits.py b/recipes/thorsten_DE/vits_tts/train_vits.py
@@ -2,11 +2,10 @@
 
 from trainer import Trainer, TrainerArgs
 
-from TTS.config.shared_configs import BaseAudioConfig
 from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.vits import Vits
+from TTS.tts.models.vits import Vits, VitsAudioConfig
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.downloaders import download_thorsten_de
@@ -21,21 +20,13 @@
  print("Downloading dataset")
  download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
 
-audio_config = BaseAudioConfig(
+audio_config = VitsAudioConfig(
  sample_rate=22050,
  win_length=1024,
  hop_length=256,
  num_mels=80,
- preemphasis=0.0,
- ref_level_db=20,
- log_func="np.log",
- do_trim_silence=True,
- trim_db=45,
  mel_fmin=0,
  mel_fmax=None,
- spec_gain=1.0,
- signal_norm=False,
- do_amp_to_db_linear=False,
 )
 
 config = VitsConfig(

diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py
@@ -2,11 +2,10 @@
 
 from trainer import Trainer, TrainerArgs
 
-from TTS.config.shared_configs import BaseAudioConfig
 from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.vits import Vits, VitsArgs
+from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
@@ -17,22 +16,8 @@
 )
 
 
-audio_config = BaseAudioConfig(
- sample_rate=22050,
- win_length=1024,
- hop_length=256,
- num_mels=80,
- preemphasis=0.0,
- ref_level_db=20,
- log_func="np.log",
- do_trim_silence=True,
- trim_db=23.0,
- mel_fmin=0,
- mel_fmax=None,
- spec_gain=1.0,
- signal_norm=False,
- do_amp_to_db_linear=False,
- resample=True,
+audio_config = VitsAudioConfig(
+ sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
 )
 
 vitsArgs = VitsArgs(
@@ -62,6 +47,7 @@
  max_text_len=325, # change this if you have a larger VRAM than 16GB
  output_path=output_path,
  datasets=[dataset_config],
+ cudnn_benchmark=False,
 )
 
 # INITIALIZE THE AUDIO PROCESSOR

diff --git a/setup.py b/setup.py
@@ -90,7 +90,7 @@ def pip_install(package_name):
  # ext_modules=find_cython_extensions(),
  # package
  include_package_data=True,
- packages=find_packages(include=["TTS*"]),
+ packages=find_packages(include=["TTS"], exclude=["*.tests", "*tests.*", "tests.*", "*tests", "tests"]),
  package_data={
  "TTS": [
  "VERSION",

diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py
@@ -9,7 +9,17 @@
 from TTS.config import load_config
 from TTS.encoder.utils.generic_utils import setup_encoder_model
 from TTS.tts.configs.vits_config import VitsConfig
-from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec
+from TTS.tts.models.vits import (
+ Vits,
+ VitsArgs,
+ VitsAudioConfig,
+ amp_to_db,
+ db_to_amp,
+ load_audio,
+ spec_to_mel,
+ wav_to_mel,
+ wav_to_spec,
+)
 from TTS.tts.utils.speakers import SpeakerManager
 
 LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json")
@@ -421,16 +431,18 @@ def test_train_step(self):
  self._check_parameter_changes(model, model_ref)
 
  def test_train_step_upsampling(self):
+ """Upsampling by the decoder upsampling layers"""
  # setup the model
  with torch.autograd.set_detect_anomaly(True):
+ audio_config = VitsAudioConfig(sample_rate=22050)
  model_args = VitsArgs(
  num_chars=32,
  spec_segment_size=10,
  encoder_sample_rate=11025,
  interpolate_z=False,
  upsample_rates_decoder=[8, 8, 4, 2],
  )
- config = VitsConfig(model_args=model_args)
+ config = VitsConfig(model_args=model_args, audio=audio_config)
  model = Vits(config).to(device)
  model.train()
  # model to train
@@ -459,10 +471,18 @@ def test_train_step_upsampling(self):
  self._check_parameter_changes(model, model_ref)
 
  def test_train_step_upsampling_interpolation(self):
+ """Upsampling by interpolation"""
  # setup the model
  with torch.autograd.set_detect_anomaly(True):
- model_args = VitsArgs(num_chars=32, spec_segment_size=10, encoder_sample_rate=11025, interpolate_z=True)
- config = VitsConfig(model_args=model_args)
+ audio_config = VitsAudioConfig(sample_rate=22050)
+ model_args = VitsArgs(
+ num_chars=32,
+ spec_segment_size=10,
+ encoder_sample_rate=11025,
+ interpolate_z=True,
+ upsample_rates_decoder=[8, 8, 2, 2],
+ )
+ config = VitsConfig(model_args=model_args, audio=audio_config)
  model = Vits(config).to(device)
  model.train()
  # model to train