Update models (Rebased) #1078

Closed
wants to merge 67 commits
67 commits
c971e06
Refactor Synthesizer class for TTSTokenizer
erogol Nov 16, 2021
c9142eb
Refactor TTSDataset to use TTSTokenizer
erogol Nov 16, 2021
da13f46
Refactor synthesis.py for TTSTokenizer
erogol Nov 16, 2021
2588e82
Refactor GlowTTS model and recipe for TTSTokenizer
erogol Nov 16, 2021
e1db180
Update imports for symbols -> characters
erogol Nov 17, 2021
66cad5b
Update for tokenizer API
erogol Nov 24, 2021
4884169
Refactor TTSDataset ⚡️
erogol Nov 30, 2021
580b99e
Refactorin VITS for the tokenizer API
erogol Nov 30, 2021
7c46d5e
Update data loader tests
erogol Dec 1, 2021
033dedf
Add init_from_config
erogol Dec 7, 2021
29ff0f6
Make lint
erogol Dec 7, 2021
49fef8d
Allow None pad and blank tokens
erogol Dec 7, 2021
7deadfe
Use the same phonemizer for `en` to `en-us`
erogol Dec 7, 2021
83b9fda
Pass samples to init_from_config in SpeakerManager
erogol Dec 7, 2021
ae96243
Update VITS for the new API
erogol Dec 7, 2021
160115b
Update Tacotron models
erogol Dec 7, 2021
0ff11d4
Update ForwardTTS
erogol Dec 7, 2021
f46ad54
Update AlignTTS
erogol Dec 7, 2021
a8a8365
Update GlowTTS
erogol Dec 7, 2021
4640d59
Update setup_model
erogol Dec 7, 2021
f8fbbd4
Update BaseTTS config
erogol Dec 7, 2021
ab413fd
Update train_tts.py
erogol Dec 7, 2021
cee01a6
Update ljspeech recipes
erogol Dec 7, 2021
e9448ca
Update loader tests
erogol Dec 7, 2021
c974633
Update tests
erogol Dec 7, 2021
848fd73
Update spec extractor
erogol Dec 7, 2021
3a15e2f
Update ljspeech download
erogol Dec 7, 2021
9338c7b
Update pylintrc
erogol Dec 7, 2021
672d766
Update VCTK formatter
erogol Dec 8, 2021
cecce06
Add file_ext args to resample.py
erogol Dec 8, 2021
95df38c
Update VCTK recipes
erogol Dec 8, 2021
b4cbf2e
Fix `too many open files`
erogol Dec 8, 2021
bbad03e
Update recipes README.md
erogol Dec 8, 2021
13a8f71
Delete `use_espeak_phonemes` from tests
erogol Jan 7, 2022
90fe858
Fix synthesis.py 🔧
erogol Jan 7, 2022
bddcc9d
Fixes small compat. issues
erogol Jan 7, 2022
c35b0c9
Update Vits for the new model API
erogol Jan 7, 2022
b2e1420
Update train_tts for the new API
erogol Jan 7, 2022
83b6cf5
Extend glow_tts model tests
erogol Jan 12, 2022
5a1d2de
Add verbose option to AudioProcessor
erogol Jan 12, 2022
3c9e518
Fix tokenizer init_from_config
erogol Jan 12, 2022
9d9a5b3
Fix glow_tts_config missing field
erogol Jan 12, 2022
79a5400
Add get_tests_data_path
erogol Jan 12, 2022
0919578
Make lint
erogol Jan 12, 2022
26be609
Extend unittests
erogol Jan 13, 2022
4b612d7
Make lint
erogol Jan 13, 2022
2433626
Fix tests
erogol Jan 14, 2022
911b2db
Fix docstring
erogol Jan 14, 2022
2472d43
Allow padding for shorter segments
erogol Jan 21, 2022
8c555d3
Implement `start_by_longest` option for TTSDatase
erogol Jan 21, 2022
c3ae114
Refactor VITS model
erogol Jan 21, 2022
c94112f
Update GAN model
erogol Jan 25, 2022
2386d80
Take file extension as an argument
erogol Jan 25, 2022
269f8c6
Update synthesizer to use iinit_from_config
erogol Jan 25, 2022
a27133d
Add pitch_fmin pitch_fmax args to the audio
erogol Jan 25, 2022
2303c91
Plot pitch over input characters
erogol Jan 25, 2022
a8352d9
Update language manager
erogol Jan 25, 2022
8397502
Update forwardtts
erogol Jan 25, 2022
c2d5be5
Fix dataset preprocessing
erogol Jan 25, 2022
ad98306
Update FastPitchConfig
erogol Jan 25, 2022
f966a45
Make style
erogol Jan 25, 2022
153c875
Update AnalyzeDataset notebook
erogol Jan 25, 2022
f912206
Load right char class dynamically
erogol Jan 28, 2022
0b8acaf
Add new speakers to the vits model
erogol Jan 28, 2022
a164485
Fix up
erogol Jan 28, 2022
6c55245
Fix VCTK VITS recipe
erogol Jan 28, 2022
a68fb76
Set `drop_last`
erogol Jan 28, 2022
3 changes: 2 additions & 1 deletion .pylintrc
@@ -168,7 +168,8 @@ disable=missing-docstring,
exception-escape,
comprehension-escape,
duplicate-code,
not-callable
not-callable,
import-outside-toplevel

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
26 changes: 13 additions & 13 deletions TTS/bin/extract_tts_spectrograms.py
@@ -13,28 +13,28 @@
from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.models import setup_model
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters

use_cuda = torch.cuda.is_available()


def setup_loader(ap, r, verbose=False):
tokenizer, _ = TTSTokenizer.init_from_config(c)
dataset = TTSDataset(
r,
c.text_cleaner,
outputs_per_step=r,
compute_linear_spec=False,
meta_data=meta_data,
samples=meta_data,
tokenizer=tokenizer,
ap=ap,
characters=c.characters if "characters" in c.keys() else None,
add_blank=c["add_blank"] if "add_blank" in c.keys() else False,
batch_group_size=0,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
min_text_len=c.min_text_len,
max_text_len=c.max_text_len,
min_audio_len=c.min_audio_len,
max_audio_len=c.max_audio_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None,
@@ -44,7 +44,7 @@ def setup_loader(ap, r, verbose=False):
if c.use_phonemes and c.compute_input_seq_cache:
# precompute phonemes to have a better estimate of sequence lengths.
dataset.compute_input_seq(c.num_loader_workers)
dataset.sort_and_filter_items(c.get("sort_by_audio_len", default=False))
dataset.preprocess_samples()

loader = DataLoader(
dataset,
@@ -75,8 +75,8 @@ def set_filename(wav_path, out_path):

def format_data(data):
# setup input data
text_input = data["text"]
text_lengths = data["text_lengths"]
text_input = data["token_id"]
text_lengths = data["token_id_lengths"]
mel_input = data["mel"]
mel_lengths = data["mel_lengths"]
item_idx = data["item_idxs"]
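The key renames in `format_data` above can be sketched library-free. `format_batch` is a hypothetical stand-in for the script's `format_data`, assuming the collated batch is a plain dict; the PR renames `"text"` → `"token_id"` and `"text_lengths"` → `"token_id_lengths"`:

```python
def format_batch(batch):
    # After this PR the collated batch exposes token ids under "token_id" /
    # "token_id_lengths" (previously "text" / "text_lengths").
    text_input = batch["token_id"]
    text_lengths = batch["token_id_lengths"]
    return text_input, text_lengths
```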
7 changes: 4 additions & 3 deletions TTS/bin/find_unique_phonemes.py
@@ -7,14 +7,15 @@

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text import text2phone
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut

phonemizer = Gruut(language="en-us")


def compute_phonemes(item):
try:
text = item[0]
language = item[-1]
ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|")
ph = phonemizer.phonemize(text).split("|")
except:
return []
return list(set(ph))
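The `compute_phonemes` change above swaps `text2phone` for a module-level Gruut phonemizer. A minimal sketch of the collection loop, with `phonemize` standing in for Gruut's phonemizer (any callable that returns phonemes joined by `|`):

```python
def unique_phonemes(phonemize, texts):
    # Collect the distinct phoneme symbols across a corpus, skipping items
    # that fail to phonemize (mirrors the bare except in the script).
    found = set()
    for text in texts:
        try:
            found.update(phonemize(text).split("|"))
        except Exception:
            continue
    return found
```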
11 changes: 10 additions & 1 deletion TTS/bin/resample.py
@@ -26,6 +26,7 @@ def resample_file(func_args):
--input_dir /root/LJSpeech-1.1/
--output_sr 22050
--output_dir /root/resampled_LJSpeech-1.1/
--file_ext wav
--n_jobs 24
""",
formatter_class=RawTextHelpFormatter,
@@ -55,6 +56,14 @@ def resample_file(func_args):
help="Path of the destination folder. If not defined, the operation is done in place",
)

parser.add_argument(
"--file_ext",
type=str,
default="wav",
required=False,
help="Extension of the audio files to resample",
)

parser.add_argument(
"--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
)
@@ -67,7 +76,7 @@ def resample_file(func_args):
args.input_dir = args.output_dir

print("Resampling the audio files...")
audio_files = glob.glob(os.path.join(args.input_dir, "**/*.wav"), recursive=True)
audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
print(f"Found {len(audio_files)} files...")
audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
with Pool(processes=args.n_jobs) as p:
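The glob change above is the core of the `--file_ext` feature. A minimal sketch of the new pattern construction (`build_search_pattern` is a hypothetical helper; in `resample.py` the expression is inlined):

```python
import os

def build_search_pattern(input_dir: str, file_ext: str = "wav") -> str:
    # The recursive glob is now built from --file_ext instead of
    # hard-coding "wav", so e.g. VCTK's flac files can be resampled too.
    return os.path.join(input_dir, f"**/*.{file_ext}")
```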
36 changes: 2 additions & 34 deletions TTS/bin/train_tts.py
@@ -1,12 +1,9 @@
import os

from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config
from TTS.config import load_config, register_config
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models import setup_model
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor


def main():
@@ -42,36 +39,8 @@ def main():
# load training samples
train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

# setup audio processor
ap = AudioProcessor(**config.audio)

# init speaker manager
if check_config_and_model_args(config, "use_speaker_embedding", True):
speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
if hasattr(config, "model_args"):
config.model_args.num_speakers = speaker_manager.num_speakers
else:
config.num_speakers = speaker_manager.num_speakers
elif check_config_and_model_args(config, "use_d_vector_file", True):
speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file"))
if hasattr(config, "model_args"):
config.model_args.num_speakers = speaker_manager.num_speakers
else:
config.num_speakers = speaker_manager.num_speakers
else:
speaker_manager = None

if hasattr(config, "use_language_embedding") and config.use_language_embedding:
language_manager = LanguageManager(config=config)
if hasattr(config, "model_args"):
config.model_args.num_languages = language_manager.num_languages
else:
config.num_languages = language_manager.num_languages
else:
language_manager = None

# init the model from config
model = setup_model(config, speaker_manager, language_manager)
model = setup_model(config, train_samples + eval_samples)

# init the trainer and 🚀
trainer = Trainer(
@@ -81,7 +50,6 @@
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
parse_command_line_args=False,
)
trainer.fit()
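The `train_tts.py` diff above replaces hand-built speaker/language managers with `setup_model(config, train_samples + eval_samples)`, which discovers speakers from the samples themselves. `num_speakers_from_samples` below is a hypothetical helper sketching that idea, assuming the post-PR sample layout `[text, audio_path, speaker_id]`; the real discovery logic lives inside `setup_model`:

```python
def num_speakers_from_samples(samples):
    # Speaker ids are read off the samples rather than from a separately
    # configured SpeakerManager in train_tts.py.
    return len({speaker_id for _text, _audio_path, speaker_id in samples})
```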
9 changes: 9 additions & 0 deletions TTS/config/shared_configs.py
@@ -57,6 +57,12 @@ class BaseAudioConfig(Coqpit):
do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

pitch_fmax (float, optional):
Maximum frequency of the F0 frames. Defaults to ```640```.

pitch_fmin (float, optional):
Minimum frequency of the F0 frames. Defaults to ```0```.

trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.

@@ -135,6 +141,9 @@ class BaseAudioConfig(Coqpit):
spec_gain: int = 20
do_amp_to_db_linear: bool = True
do_amp_to_db_mel: bool = True
# f0 params
pitch_fmax: float = 640.0
pitch_fmin: float = 0.0
# normalization params
signal_norm: bool = True
min_level_db: int = -100
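The two new F0 fields above can be sketched as a plain dataclass. `AudioConfigSketch` is a hypothetical stand-in for `BaseAudioConfig` (which is a Coqpit class), with defaults taken from the diff:

```python
from dataclasses import dataclass

@dataclass
class AudioConfigSketch:
    # F0 extraction bounds added to BaseAudioConfig in this PR.
    pitch_fmax: float = 640.0  # maximum F0 considered for the pitch frames
    pitch_fmin: float = 0.0    # minimum F0 considered for the pitch frames
```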
13 changes: 5 additions & 8 deletions TTS/tts/configs/fast_pitch_config.py
@@ -89,12 +89,9 @@ class FastPitchConfig(BaseTTSConfig):
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.

binary_loss_alpha (float):
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.

binary_align_loss_start_step (int):
Start binary alignment loss after this many steps. Defaults to 20000.

min_seq_len (int):
Minimum input sequence length to be used at training.

@@ -129,12 +126,12 @@ class FastPitchConfig(BaseTTSConfig):
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
pitch_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
binary_align_loss_alpha: float = 1.0
binary_align_loss_start_step: int = 20000
pitch_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150

# overrides
min_seq_len: int = 13
1 change: 1 addition & 0 deletions TTS/tts/configs/glow_tts_config.py
@@ -153,6 +153,7 @@ class GlowTTSConfig(BaseTTSConfig):

# multi-speaker settings
use_speaker_embedding: bool = False
speakers_file: str = None
use_d_vector_file: bool = False
d_vector_file: str = False

18 changes: 16 additions & 2 deletions TTS/tts/configs/shared_configs.py
@@ -53,6 +53,10 @@ class CharactersConfig(Coqpit):
"""Defines arguments for the `BaseCharacters` and its subclasses.

Args:
characters_class (str):
Defines the class of the characters used. If None, we pick ```Phonemes``` or ```Graphemes``` based on
the configuration. Defaults to None.

pad (str):
characters in place of empty padding. Defaults to None.

@@ -78,12 +82,13 @@

is_unique (bool):
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
models trained with character lists with duplicates.
models trained with character lists with duplicates. Defaults to True.

is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""

characters_class: str = None
pad: str = None
eos: str = None
bos: str = None
@@ -166,9 +171,16 @@ class BaseTTSConfig(BaseTrainingConfig):
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.

precompute_num_workers (int):
Number of workers to precompute features. Defaults to 0.

use_noise_augment (bool):
Augment the input audio with random noise.

start_by_longest (bool):
If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
Defaults to False.

add_blank (bool):
Add blank characters between each other two characters. It improves performance for some models at expense
of slower run-time due to the longer input sequence.
@@ -207,6 +219,7 @@ class BaseTTSConfig(BaseTrainingConfig):
phoneme_cache_path: str = None
# vocabulary parameters
characters: CharactersConfig = None
add_blank: bool = False
# training params
batch_group_size: int = 0
loss_masking: bool = None
@@ -218,8 +231,9 @@
max_text_len: int = float("inf")
compute_f0: bool = False
compute_linear_spec: bool = False
precompute_num_workers: int = 0
use_noise_augment: bool = False
add_blank: bool = False
start_by_longest: bool = False
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
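The new `start_by_longest` option documented above ("start loading the longest batch first ... useful for checking OOM issues") can be illustrated with a toy ordering function. This is a sketch of the intent only, not the actual `TTSDataset` sampler logic; `order_samples` and its arguments are hypothetical:

```python
def order_samples(samples, lengths, start_by_longest=False):
    # Serve samples shortest-first (the usual sorted order); with
    # start_by_longest=True the single longest sample is moved to the front,
    # so an out-of-memory failure surfaces on the first step, not mid-run.
    order = sorted(range(len(samples)), key=lambda i: lengths[i])
    if start_by_longest:
        longest = order.pop()  # index of the longest sample
        order.insert(0, longest)
    return [samples[i] for i in order]
```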
13 changes: 1 addition & 12 deletions TTS/tts/configs/vits_config.py
@@ -67,15 +67,6 @@ class VitsConfig(BaseTTSConfig):
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

sort_by_audio_len (bool):
If true, dataloder sorts the data by audio length else sorts by the input text length. Defaults to `True`.

min_seq_len (int):
Minimum sequnce length to be considered for training. Defaults to `0`.

max_seq_len (int):
Maximum sequnce length to be considered for training. Defaults to `500000`.

r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.

@@ -123,16 +114,14 @@ class VitsConfig(BaseTTSConfig):
feat_loss_alpha: float = 1.0
mel_loss_alpha: float = 45.0
dur_loss_alpha: float = 1.0
aligner_loss_alpha = 1.0
speaker_encoder_loss_alpha: float = 1.0

# data loader params
return_wav: bool = True
compute_linear_spec: bool = True

# overrides
sort_by_audio_len: bool = True
min_seq_len: int = 0
max_seq_len: int = 500000
r: int = 1 # DO NOT CHANGE
add_blank: bool = True

4 changes: 2 additions & 2 deletions TTS/tts/datasets/__init__.py
@@ -13,7 +13,7 @@ def split_dataset(items):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.

Args:
items (List[List]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`.
"""
speakers = [item[-1] for item in items]
is_multi_speaker = len(set(speakers)) > 1
@@ -52,7 +52,7 @@ def load_tts_samples(

formatter (Callable, optional): The preprocessing function to be applied to create the list of samples. It
must take the root_path and the meta_file name and return a list of samples in the format of
`[[audio_path, text, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as
`[[text, audio_path, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as
example. Defaults to None.

Returns:
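The docstring fixes above record the new sample layout: `[text, audio_path, speaker_id]` instead of `[audio_path, text, speaker_id]`. A minimal sketch, with a hypothetical file path for illustration; note the speaker id stays last, which is what `split_dataset` relies on via `item[-1]`:

```python
# Post-PR sample layout: text first, then audio path, then speaker id.
sample = ["Hello world.", "/data/wavs/utt1.wav", "ljspeech"]
text, audio_path, speaker_id = sample

# split_dataset reads the speaker from the last position, so it is
# unaffected by the swap of the first two fields.
assert sample[-1] == speaker_id
```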