From c971e065db80d4f9561867ab1d9924e3d9746e15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:29:57 +0100
Subject: [PATCH 01/67] Refactor Synthesizer class for TTSTokenizer

---
 TTS/utils/synthesizer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2e4f4735bc..a06a493fe9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -114,7 +114,8 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
         self.tts_config = load_config(tts_config_path)
         self.use_phonemes = self.tts_config.use_phonemes
-        self.tts_model = setup_tts_model(config=self.tts_config)
+        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
+        self.tokenizer = TTSTokenizer.init_from_config(self.tts_config)
 
         speaker_manager = self._init_speaker_manager()
         language_manager = self._init_language_manager()
@@ -332,6 +333,8 @@ def tts(
                 text=sen,
                 CONFIG=self.tts_config,
                 use_cuda=self.use_cuda,
+                ap=self.ap,
+                tokenizer=self.tokenizer,
                 speaker_id=speaker_id,
                 language_id=language_id,
                 language_name=language_name,
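A minimal usage sketch of the wiring this patch introduces (not part of the patch series; the config path and sample text are placeholders, and `TTSTokenizer.init_from_config` is assumed to return only the tokenizer at this point in the series):

    from TTS.config import load_config
    from TTS.utils.audio import AudioProcessor
    from TTS.tts.utils.text.tokenizer import TTSTokenizer

    # Build the audio processor and the tokenizer once, from the same model config.
    config = load_config("tts_config.json")  # placeholder path
    ap = AudioProcessor(verbose=False, **config.audio)
    tokenizer = TTSTokenizer.init_from_config(config)

    # The tokenizer owns text cleaning, phonemization and character-to-ID mapping,
    # so `synthesis()` no longer reaches into global symbol tables.
    token_ids = tokenizer.text_to_ids("Hello world.")
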
From c9142eb47f407bda1e8a4ad8bed83f544fd911ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:33:21 +0100
Subject: [PATCH 02/67] Refactor TTSDataset to use TTSTokenizer

---
 TTS/tts/datasets/dataset.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index bd7022e35b..e71cdb67c0 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -69,6 +69,9 @@ def __init__(
 
             samples (list): List of dataset instances.
 
+            tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
+                use the given. Defaults to None.
+
             tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
                 use the given. Defaults to None.
 
@@ -202,6 +205,20 @@ def get_token_ids(self, idx, text):
             token_ids = self.tokenizer.text_to_ids(text)
         return np.array(token_ids, dtype=np.int32)
 
+    @staticmethod
+    def _parse_sample(item):
+        language_name = None
+        attn_file = None
+        if len(item) == 5:
+            text, wav_file, speaker_name, language_name, attn_file = item
+        elif len(item) == 4:
+            text, wav_file, speaker_name, language_name = item
+        elif len(item) == 3:
+            text, wav_file, speaker_name = item
+        else:
+            raise ValueError(" [!] Dataset cannot parse the sample.")
+        return text, wav_file, speaker_name, language_name, attn_file
+
     def load_data(self, idx):
         item = self.samples[idx]

From da13f46a04f329d70dd73725cae9185f80780026 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:34:45 +0100
Subject: [PATCH 03/67] Refactor synthesis.py for TTSTokenizer

---
 TTS/tts/utils/synthesis.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index c2efdcba70..10eb55a65c 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -175,11 +175,8 @@ def synthesis(
     text,
     CONFIG,
     use_cuda,
-<<<<<<< HEAD
-=======
     ap,
     tokenizer,
->>>>>>> Refactor synthesis.py for TTSTokenizer
     speaker_id=None,
     style_wav=None,
     use_griffin_lim=False,

From 2588e8290a32dfbb320bff4907f4c41eece91cb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:36:35 +0100
Subject: [PATCH 04/67] Refactor GlowTTS model and recipe for TTSTokenizer

---
 TTS/tts/models/base_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 272317905b..64086a847d 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -287,7 +287,7 @@ def get_data_loader(
                 verbose=verbose,
                 speaker_id_mapping=speaker_id_mapping,
                 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
-                tokenizer=self.tokenizer,
+                tokenizer=self.tokenizer
             )

             # wait all the DDP process to be ready

From e1db18045c4b593ada230b097cdb15421c7c2c60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 17 Nov 2021 12:46:04 +0100
Subject: [PATCH 05/67] Update imports for symbols -> characters

---
 TTS/tts/models/base_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 64086a847d..272317905b 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -287,7 +287,7 @@ def get_data_loader(
                 verbose=verbose,
                 speaker_id_mapping=speaker_id_mapping,
                 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
-                tokenizer=self.tokenizer
+                tokenizer=self.tokenizer,
             )

             # wait all the DDP process to be ready

From 66cad5b5b4f62fd4696498b93719f3336fb61754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 24 Nov 2021 17:49:20 +0100
Subject: [PATCH 06/67] Update for tokenizer API

---
 TTS/utils/synthesizer.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index a06a493fe9..2e4f4735bc 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -114,8 +114,7 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
         self.tts_config = load_config(tts_config_path)
         self.use_phonemes = self.tts_config.use_phonemes
-        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
-        self.tokenizer = TTSTokenizer.init_from_config(self.tts_config)
+        self.tts_model = setup_tts_model(config=self.tts_config)
 
         speaker_manager = self._init_speaker_manager()
         language_manager = self._init_language_manager()
@@ -333,8 +332,6 @@ def tts(
                 text=sen,
                 CONFIG=self.tts_config,
                 use_cuda=self.use_cuda,
-                ap=self.ap,
-                tokenizer=self.tokenizer,
                 speaker_id=speaker_id,
                 language_id=language_id,
                 language_name=language_name,

From 580b99e43c03307c8d9ec8a825f2c165710bb314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 30 Nov 2021 15:50:18 +0100
Subject: [PATCH 07/67] =?UTF-8?q?Refactor=20TTSDataset=20=E2=9A=A1?=
 =?UTF-8?q?=EF=B8=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TTS/tts/datasets/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index e71cdb67c0..0cf910467f 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -216,8 +216,8 @@ def _parse_sample(item):
         elif len(item) == 3:
             text, wav_file, speaker_name = item
         else:
-            raise ValueError(" [!] Dataset cannot parse the sample.")
-        return text, wav_file, speaker_name, language_name, attn_file
+            token_ids = self.tokenizer.text_to_ids(text)
+        return token_ids
 
     def load_data(self, idx):
         item = self.samples[idx]

From 580b99e43c03307c8d9ec8a825f2c165710bb314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 30 Nov 2021 15:55:36 +0100
Subject: [PATCH 08/67] Refactoring VITS for the tokenizer API

---
 TTS/tts/models/vits.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index d7059da905..aa578ff8cd 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -272,10 +272,7 @@ class Vits(BaseTTS):
 
     # pylint: disable=dangerous-default-value
     def __init__(
-        self,
-        config: Coqpit,
-        speaker_manager: SpeakerManager = None,
-        language_manager: LanguageManager = None,
+        self, config: Coqpit, ap: "AudioProcessor", tokenizer: "TTSTokenizer", speaker_manager: SpeakerManager = None, language_manager: LanguageManager = None
     ):
 
         super().__init__(config, ap, tokenizer, speaker_manager)

From 7c46d5ec83372074d8dd5a8540ef83da055a551c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 1 Dec 2021 10:06:02 +0100
Subject: [PATCH 09/67] Update data loader tests

---
 TTS/tts/datasets/dataset.py     | 17 -----------------
 tests/data_tests/test_loader.py |  1 -
 2 files changed, 18 deletions(-)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 0cf910467f..bd7022e35b 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -69,9 +69,6 @@ def __init__(
 
             samples (list): List of dataset instances.
 
-            tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
-                use the given. Defaults to None.
-
             tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
                 use the given. Defaults to None.
@@ -205,20 +202,6 @@ def get_token_ids(self, idx, text):
             token_ids = self.tokenizer.text_to_ids(text)
         return np.array(token_ids, dtype=np.int32)
 
-    @staticmethod
-    def _parse_sample(item):
-        language_name = None
-        attn_file = None
-        if len(item) == 5:
-            text, wav_file, speaker_name, language_name, attn_file = item
-        elif len(item) == 4:
-            text, wav_file, speaker_name, language_name = item
-        elif len(item) == 3:
-            text, wav_file, speaker_name = item
-        else:
-            token_ids = self.tokenizer.text_to_ids(text)
-            return token_ids
-
     def load_data(self, idx):
         item = self.samples[idx]

diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index a1d43b8176..ac850a1440 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -39,7 +39,6 @@ def __init__(self, *args, **kwargs):
 
     def _create_dataloader(self, batch_size, r, bgs):
         items = ljspeech(c.data_path, "metadata.csv")
-        tokenizer = TTSTokenizer.init_from_config(c)
 
         dataset = TTSDataset(
             outputs_per_step=r,

From 033dedffbc79d7f8961ce40f65a5913bca0d92bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 08:56:57 +0000
Subject: [PATCH 10/67] Add init_from_config

---
 TTS/vocoder/models/base_vocoder.py | 1 +
 TTS/vocoder/models/wavegrad.py     | 4 ++++
 TTS/vocoder/models/wavernn.py      | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/TTS/vocoder/models/base_vocoder.py b/TTS/vocoder/models/base_vocoder.py
index 9d6ef26f6f..2728525cc5 100644
--- a/TTS/vocoder/models/base_vocoder.py
+++ b/TTS/vocoder/models/base_vocoder.py
@@ -20,6 +20,7 @@ class BaseVocoder(BaseModel):
 
     def __init__(self, config):
         super().__init__(config)
+        self._set_model_args(config)
 
     def _set_model_args(self, config: Coqpit):
         """Setup model args based on the config type.
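A sketch of the construction pattern this patch adds to the vocoder models (the pattern itself is what the diffs below show; the `TTS.vocoder.configs` import path is an assumption):

    from TTS.vocoder.configs import WavegradConfig
    from TTS.vocoder.models.wavegrad import Wavegrad

    config = WavegradConfig()
    # `init_from_config` is a thin factory; for these models it is equivalent
    # to calling the constructor directly, but it gives every model a uniform,
    # config-driven entry point.
    model = Wavegrad.init_from_config(config)
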
diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py
index ed4f4b37b4..f801715df7 100644
--- a/TTS/vocoder/models/wavegrad.py
+++ b/TTS/vocoder/models/wavegrad.py
@@ -339,3 +339,7 @@ def on_epoch_start(self, trainer):  # pylint: disable=unused-argument
         noise_schedule = self.config["train_noise_schedule"]
         betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
         self.compute_noise_level(betas)
+
+    @staticmethod
+    def init_from_config(config: "WavegradConfig"):
+        return Wavegrad(config)

diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py
index 1977efb687..5ce01782ef 100644
--- a/TTS/vocoder/models/wavernn.py
+++ b/TTS/vocoder/models/wavernn.py
@@ -631,3 +631,7 @@ def get_data_loader(  # pylint: disable=no-self-use
 
     def get_criterion(self):
         # define train functions
         return WaveRNNLoss(self.args.mode)
+
+    @staticmethod
+    def init_from_config(config: "WavernnConfig"):
+        return Wavernn(config)
From 29ff0f6a376670073680fc7293496f9218873e4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:51:58 +0000
Subject: [PATCH 11/67] Make lint

---
 TTS/tts/datasets/__init__.py                  |   4 +-
 TTS/tts/datasets/dataset.py                   | 177 ++++++++--------
 TTS/tts/utils/synthesis.py                    |   4 +-
 TTS/tts/utils/text/characters.py              | 199 +++++++++++-------
 TTS/tts/utils/text/phonemizers/base.py        |  36 ++--
 .../utils/text/phonemizers/espeak_wrapper.py  |  65 ++++--
 .../utils/text/phonemizers/gruut_wrapper.py   |   3 +-
 .../text/phonemizers/ja_jp_phonemizer.py      |  20 +-
 .../text/phonemizers/multi_phonemizer.py      |  28 +--
 .../text/phonemizers/zh_cn_phonemizer.py      |  23 +-
 TTS/tts/utils/text/punctuation.py             |  18 +-
 TTS/utils/audio.py                            |   3 +-
 TTS/utils/synthesizer.py                      |   1 -
 TTS/vocoder/models/gan.py                     |   2 +-
 14 files changed, 333 insertions(+), 250 deletions(-)

diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py
index 40eed7e365..4e8a2485db 100644
--- a/TTS/tts/datasets/__init__.py
+++ b/TTS/tts/datasets/__init__.py
@@ -88,8 +88,8 @@ def load_tts_samples(
             meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
         # load attention masks for the duration predictor training
-        if dataset.meta_file_attn_mask:
-            meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
+        if d.meta_file_attn_mask:
+            meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"]))
             for idx, ins in enumerate(meta_data_train_all):
                 attn_file = meta_data[ins[1]].strip()
                 meta_data_train_all[idx].append(attn_file)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index bd7022e35b..229f59c7a0 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -1,7 +1,6 @@
 import collections
 import os
 import random
-from multiprocessing import Pool
 from typing import Dict, List, Union
 
 import numpy as np
@@ -10,7 +9,6 @@
 from torch.utils.data import Dataset
 
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
@@ -183,7 +181,7 @@ def load_wav(self, filename):
     def get_phonemes(self, idx, text):
         out_dict = self.phoneme_dataset[idx]
         assert text == out_dict["text"], f"{text} != {out_dict['text']}"
-        assert out_dict["token_ids"].size > 0
+        assert len(out_dict["token_ids"]) > 0
         return out_dict
 
     def get_f0(self, idx):
@@ -192,7 +190,8 @@ def get_f0(self, idx):
         assert wav_file == out_dict["audio_file"]
         return out_dict
 
-    def get_attn_maks(self, attn_file):
+    @staticmethod
+    def get_attn_mask(attn_file):
         return np.load(attn_file)
 
     def get_token_ids(self, idx, text):
@@ -205,7 +204,7 @@ def load_data(self, idx):
         item = self.samples[idx]
 
-        text, wav_file, speaker_name, language_name, attn_file = _parse_sample(item)
+        text, wav_file, speaker_name, _, attn_file = _parse_sample(item)
         raw_text = text
 
         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
@@ -263,7 +262,7 @@ def filter_by_length(lengths: List[int], min_len: int, max_len: int):
         idxs = np.argsort(lengths)  # ascending order
         ignore_idx = []
         keep_idx = []
-        for i, idx in enumerate(idxs):
+        for idx in idxs:
             length = lengths[idx]
             if length < min_len or length > max_len:
                 ignore_idx.append(idx)
@@ -278,6 +277,7 @@ def sort_by_length(lengths: List[int]):
 
     @staticmethod
     def create_buckets(samples, batch_group_size: int):
+        assert batch_group_size > 0
         for i in range(len(samples) // batch_group_size):
             offset = i * batch_group_size
             end_offset = offset + batch_group_size
@@ -320,7 +320,8 @@ def preprocess_samples(self):
         # shuffle batch groups
         # create batches with similar length items
         # the larger the `batch_group_size`, the higher the length variety in a batch.
-        samples = self.create_buckets(samples, self.batch_group_size)
+        if self.batch_group_size > 0:
+            samples = self.create_buckets(samples, self.batch_group_size)
         # update items to the new sorted items
         self.samples = samples
@@ -572,6 +573,7 @@ def precompute(self, num_workers=1):
 
         We use pytorch dataloader because we are lazy.
         """
+        print("[*] Pre-computing phonemes...")
        with tqdm.tqdm(total=len(self)) as pbar:
             batch_size = num_workers if num_workers > 0 else 1
             dataloder = torch.utils.data.DataLoader(
@@ -659,16 +661,21 @@ def __len__(self):
         return len(self.samples)
 
     def precompute(self, num_workers=0):
+        print("[*] Pre-computing F0s...")
         with tqdm.tqdm(total=len(self)) as pbar:
             batch_size = num_workers if num_workers > 0 else 1
+            # we do not normalize at preprocessing
+            normalize_f0 = self.normalize_f0
+            self.normalize_f0 = False
             dataloder = torch.utils.data.DataLoader(
                 batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
             )
             computed_data = []
             for batch in dataloder:
                 f0 = batch["f0"]
-                computed_data.append([f for f in f0])
+                computed_data.append(f for f in f0)
                 pbar.update(batch_size)
+            self.normalize_f0 = normalize_f0
 
         if self.normalize_f0:
             computed_data = [tensor for batch in computed_data for tensor in batch]  # flatten
@@ -747,80 +754,80 @@ def print_logs(self, level: int = 0) -> None:
         print(f"{indent}| > Number of instances : {len(self.samples)}")
 
-if __name__ == "__main__":
-    from torch.utils.data import DataLoader
-
-    from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
-    from TTS.tts.datasets import load_tts_samples
-    from TTS.tts.utils.text.characters import IPAPhonemes
-    from TTS.tts.utils.text.phonemizers import ESpeak
-
-    dataset_config = BaseDatasetConfig(
-        name="ljspeech",
-        meta_file_train="metadata.csv",
-        path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
-    )
-    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
-    samples = train_samples + eval_samples
-
-    phonemizer = ESpeak(language="en-us")
-    tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
-    # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
-    # ph_dataset.precompute(num_workers=4)
-
-    # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     break
-
-    audio_config = BaseAudioConfig(
-        sample_rate=22050,
-        win_length=1024,
-        hop_length=256,
-        num_mels=80,
-        preemphasis=0.0,
-        ref_level_db=20,
-        log_func="np.log",
-        do_trim_silence=True,
-        trim_db=45,
-        mel_fmin=0,
-        mel_fmax=8000,
-        spec_gain=1.0,
-        signal_norm=False,
-        do_amp_to_db_linear=False,
-    )
-
-    ap = AudioProcessor.init_from_config(audio_config)
-
-    # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
-
-    # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     breakpoint()
-    #     break
-
-    dataset = TTSDataset(
-        outputs_per_step=1,
-        compute_linear_spec=False,
-        samples=samples,
-        ap=ap,
-        return_wav=False,
-        batch_group_size=0,
-        min_seq_len=0,
-        max_seq_len=500,
-        use_noise_augment=False,
-        verbose=True,
-        speaker_id_mapping=None,
-        d_vector_mapping=None,
-        compute_f0=True,
-        f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
-        tokenizer=tokenizer,
-        phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
-        precompute_num_workers=4,
-    )
-
-    dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
-    for batch in dataloader:
-        print(batch)
-        break
+# if __name__ == "__main__":
+#     from torch.utils.data import DataLoader
+
+#     from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
+#     from TTS.tts.datasets import load_tts_samples
+#     from TTS.tts.utils.text.characters import IPAPhonemes
+#     from TTS.tts.utils.text.phonemizers import ESpeak
+
+#     dataset_config = BaseDatasetConfig(
+#         name="ljspeech",
+#         meta_file_train="metadata.csv",
+#         path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
+#     )
+#     train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+#     samples = train_samples + eval_samples

+#     phonemizer = ESpeak(language="en-us")
+#     tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
+#     # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
+#     # ph_dataset.precompute(num_workers=4)

+#     # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     break

+#     audio_config = BaseAudioConfig(
+#         sample_rate=22050,
+#         win_length=1024,
+#         hop_length=256,
+#         num_mels=80,
+#         preemphasis=0.0,
+#         ref_level_db=20,
+#         log_func="np.log",
+#         do_trim_silence=True,
+#         trim_db=45,
+#         mel_fmin=0,
+#         mel_fmax=8000,
+#         spec_gain=1.0,
+#         signal_norm=False,
+#         do_amp_to_db_linear=False,
+#     )

+#     ap = AudioProcessor.init_from_config(audio_config)

+#     # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)

+#     # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     breakpoint()
+#     #     break

+#     dataset = TTSDataset(
+#         outputs_per_step=1,
+#         compute_linear_spec=False,
+#         samples=samples,
+#         ap=ap,
+#         return_wav=False,
+#         batch_group_size=0,
+#         min_seq_len=0,
+#         max_seq_len=500,
+#         use_noise_augment=False,
+#         verbose=True,
+#         speaker_id_mapping=None,
+#         d_vector_mapping=None,
+#         compute_f0=True,
+#         f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
+#         tokenizer=tokenizer,
+#         phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
+#         precompute_num_workers=4,
+#     )

+#     dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
+#     for batch in dataloader:
+#         print(batch)
+#         break
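For reference, a sketch of how the phoneme pre-computation shown in this file is meant to be driven (names are from this diff; the cache path is a placeholder):

    # Fill the phoneme cache once, before training starts. `precompute` runs a
    # throwaway DataLoader pass so that workers can phonemize in parallel.
    ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="phoneme_cache")
    ph_dataset.precompute(num_workers=4)

    # Subsequent `ph_dataset[idx]` lookups then read token IDs from the cache.
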
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 10eb55a65c..47ea0e934c 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -283,10 +283,10 @@ def synthesis(
         wav = model_outputs.squeeze(0)
     else:
         if use_griffin_lim:
-            wav = inv_spectrogram(model_outputs, ap, CONFIG)
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
             # trim silence
             if do_trim_silence:
-                wav = trim_silence(wav, ap)
+                wav = trim_silence(wav, model.ap)
     return_dict = {
         "wav": wav,
         "alignments": alignments,

diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py
index 24ce51f1a8..aae6844ffa 100644
--- a/TTS/tts/utils/text/characters.py
+++ b/TTS/tts/utils/text/characters.py
@@ -1,3 +1,8 @@
+from dataclasses import replace
+
+from TTS.tts.configs.shared_configs import CharactersConfig
+
+
 def parse_symbols():
     return {
         "pad": _pad,
@@ -29,46 +34,49 @@
 _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
 
 
-def create_graphemes(
-    characters=_characters,
-    punctuations=_punctuations,
-    pad=_pad,
-    eos=_eos,
-    bos=_bos,
-    blank=_blank,
-    unique=True,
-):  # pylint: disable=redefined-outer-name
-    """Function to create default characters and phonemes"""
-    # create graphemes
-    _graphemes = list(characters)
-    _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
-    _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
-    _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
-    _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
-    _graphemes = _graphemes + list(punctuations)
-    return _graphemes, _phonemes
-
-
-def create_phonemes(
-    phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
-):
-    # create phonemes
-    _phonemes = None
-    _phonemes_sorted = (
-        sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
-    )  # this is to keep previous models compatible.
-    _phonemes = list(_phonemes_sorted)
-    _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
-    _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
-    _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
-    _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
-    _phonemes = _phonemes + list(punctuations)
-    _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
-    return _phonemes
-
-
-graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
-phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
+# def create_graphemes(
+#     characters=_characters,
+#     punctuations=_punctuations,
+#     pad=_pad,
+#     eos=_eos,
+#     bos=_bos,
+#     blank=_blank,
+#     unique=True,
+# ):  # pylint: disable=redefined-outer-name
+#     """Function to create default characters and phonemes"""
+#     # create graphemes
+#     _graphemes = list(characters)
+#     _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
+#     _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
+#     _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
+#     _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
+#     _graphemes = _graphemes + list(punctuations)
+#     return _graphemes, _phonemes
+
+
+# def create_phonemes(
+#     phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
+# ):
+#     # create phonemes
+#     _phonemes = None
+#     _phonemes_sorted = (
+#         sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
+#     )  # this is to keep previous models compatible.
+#     _phonemes = list(_phonemes_sorted)
+#     _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
+#     _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
+#     _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
+#     _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
+#     _phonemes = _phonemes + list(punctuations)
+#     _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
+#     return _phonemes
+
+
+# DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
+# DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
 
 
 class BaseCharacters:
@@ -114,7 +122,7 @@ def __init__(
         eos: str,
         bos: str,
         blank: str,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         self._characters = characters
@@ -202,14 +210,20 @@ def _create_vocab(self):
         _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
         self._vocab = _vocab + list(self._punctuations)
         self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
-        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
         if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
             assert (
                 len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
-            ), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}"
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
 
     def char_to_id(self, char: str) -> int:
-        return self._char_to_id[char]
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
 
     def id_to_char(self, idx: int) -> str:
         return self._id_to_char[idx]
@@ -229,9 +243,23 @@ def print_log(self, level: int = 0):
         print(f"{indent}| > Num chars: {self.num_chars}")
 
     @staticmethod
-    def init_from_config(config: "Coqpit"):
-        return BaseCharacters(
-            **config.characters if config.characters is not None else {},
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+
+        Implement this method for your subclass.
+        """
+        ...
+
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
         )
 
@@ -275,31 +303,42 @@ def __init__(
         eos: str = _eos,
         bos: str = _bos,
         blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
 
     @staticmethod
     def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
         # band-aid for compatibility with old models
         if "characters" in config and config.characters is not None:
             if "phonemes" in config.characters and config.characters.phonemes is not None:
                 config.characters["characters"] = config.characters["phonemes"]
-            return IPAPhonemes(
-                characters=config.characters["characters"],
-                punctuations=config.characters["punctuations"],
-                pad=config.characters["pad"],
-                eos=config.characters["eos"],
-                bos=config.characters["bos"],
-                blank=config.characters["blank"],
-                is_unique=config.characters["is_unique"],
-                is_sorted=config.characters["is_sorted"],
-            )
-        else:
-            return IPAPhonemes(
-                **config.characters if config.characters is not None else {},
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
             )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
 
 
 class Graphemes(BaseCharacters):
@@ -339,24 +378,42 @@ def __init__(
         eos: str = _eos,
         bos: str = _bos,
         blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
 
     @staticmethod
     def init_from_config(config: "Coqpit"):
-        return Graphemes(
-            **config.characters if config.characters is not None else {},
-        )
+        """Init a Graphemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
 
 
 if __name__ == "__main__":
     gr = Graphemes()
     ph = IPAPhonemes()
-
-    print(gr.vocab)
-    print(ph.vocab)
-
-    print(gr.num_chars)
-    assert "a" == gr.id_to_char(gr.char_to_id("a"))
+    gr.print_log()
+    ph.print_log()
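The practical consequence of the new `init_from_config` implementations above: they return a `(characters, config)` pair instead of just the character set, so callers must now unpack both (a sketch, using the classes from this diff):

    # The returned config may carry a freshly generated `characters` section.
    characters, config = Graphemes.init_from_config(config)
    print(characters.num_chars)
    assert "a" == characters.id_to_char(characters.char_to_id("a"))
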
diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py
index 249c8bce83..08fa8e130a 100644
--- a/TTS/tts/utils/text/phonemizers/base.py
+++ b/TTS/tts/utils/text/phonemizers/base.py
@@ -1,6 +1,5 @@
 import abc
-import itertools
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 from TTS.tts.utils.text.punctuation import Punctuation
 
@@ -8,6 +7,19 @@
 class BasePhonemizer(abc.ABC):
     """Base phonemizer class
 
+    Phonemization follows the following steps:
+        1. Preprocessing:
+            - remove empty lines
+            - remove punctuation
+            - keep track of punctuation marks
+
+        2. Phonemization:
+            - convert text to phonemes
+
+        3. Postprocessing:
+            - join phonemes
+            - restore punctuation marks
+
     Args:
         language (str): Language used by the phonemizer.
 
@@ -51,40 +63,30 @@ def language(self):
     @abc.abstractmethod
     def name():
         """The name of the backend"""
+        ...
 
     @classmethod
     @abc.abstractmethod
     def is_available(cls):
         """Returns True if the backend is installed, False otherwise"""
+        ...
 
     @classmethod
     @abc.abstractmethod
     def version(cls):
         """Return the backend version as a tuple (major, minor, patch)"""
+        ...
 
     @staticmethod
     @abc.abstractmethod
     def supported_languages():
         """Return a dict of language codes -> name supported by the backend"""
+        ...
 
     def is_supported_language(self, language):
         """Returns True if `language` is supported by the backend"""
         return language in self.supported_languages()
 
-    fr"""
-    Phonemization follows the following steps:
-        1. Preprocessing:
-            - remove empty lines
-            - remove punctuation
-            - keep track of punctuation marks
-
-        2. Phonemization:
-            - convert text to phonemes
-
-        3. Postprocessing:
-            - join phonemes
-            - restore punctuation marks
-    """
-
     @abc.abstractmethod
     def _phonemize(self, text, separator):
         """The main phonemization method"""
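A sketch of what a concrete subclass has to provide under this contract (the class `MyPhonemizer` and its toy rule are hypothetical; only the abstract methods shown are mandatory):

    class MyPhonemizer(BasePhonemizer):
        @staticmethod
        def name():
            return "my_phonemizer"

        @classmethod
        def is_available(cls):
            return True

        @classmethod
        def version(cls):
            return (1, 0, 0)

        @staticmethod
        def supported_languages():
            return {"en-us": "English (US)"}

        def _phonemize(self, text, separator):
            # toy character-level "phonemization"; a real backend goes here
            return separator.join(text.lower())

    # Punctuation stripping/restoring is handled by the base class around _phonemize.
    print(MyPhonemizer("en-us").phonemize("Hello, world!"))
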
diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
index f1d0b6cd2b..3cccee41d6 100644
--- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
+++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
@@ -28,29 +28,30 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
         "1",  # UTF8 text encoding
     ]
     cmd.extend(args)
-    logging.debug("espeakng: executing %s" % repr(cmd))
-    p = subprocess.Popen(
+    logging.debug("espeakng: executing %s", repr(cmd))
+
+    with subprocess.Popen(
         cmd,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
-    )
-    res = iter(p.stdout.readline, b"")
-    if not sync:
+    ) as p:
+        res = iter(p.stdout.readline, b"")
+        if not sync:
+            p.stdout.close()
+            if p.stderr:
+                p.stderr.close()
+            if p.stdin:
+                p.stdin.close()
+            return res
+        res2 = []
+        for line in res:
+            res2.append(line)
         p.stdout.close()
         if p.stderr:
             p.stderr.close()
         if p.stdin:
             p.stdin.close()
-        return res
-    res2 = []
-    for line in res:
-        res2.append(line)
-    p.stdout.close()
-    if p.stderr:
-        p.stderr.close()
-    if p.stdin:
-        p.stdin.close()
-    p.wait()
+        p.wait()
     return res2
@@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer):
 
     def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
         if self._ESPEAK_LIB is None:
             raise Exception("Unknown backend: %s" % backend)
+
+        # band-aid for backwards compatibility
+        if language == "en":
+            language = "en-us"
+
         super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
+        if backend is not None:
+            self.backend = backend
+
+    @property
+    def backend(self):
+        return self._ESPEAK_LIB
+
+    @backend.setter
+    def backend(self, backend):
+        if backend not in ["espeak", "espeak-ng"]:
+            raise Exception("Unknown backend: %s" % backend)
+        self._ESPEAK_LIB = backend
 
     def auto_set_espeak_lib(self) -> None:
         if is_tool("espeak-ng"):
@@ -115,24 +133,25 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str:
         # espeak and espeak-ng parses `ipa` differently
         if tie:
             # use '͡' between phonemes
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                 args.append("--ipa=1")
             else:
                 args.append("--ipa=3")
         else:
             # split with '_'
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                 args.append("--ipa=3")
             else:
                 args.append("--ipa=1")
         if tie:
             args.append("--tie=%s" % tie)
+
         args.append('"' + text + '"')
         # compute phonemes
         phonemes = ""
         for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
-            logging.debug("line: %s" % repr(line))
-            phonemes += line.decode("utf8").strip()
+            logging.debug("line: %s", repr(line))
+            phonemes += line.decode("utf8").strip()[2:]  # skip two redundant characters
         return phonemes.replace("_", separator)
 
     def _phonemize(self, text, separator=None):
@@ -146,7 +165,7 @@ def supported_languages() -> Dict:
             Dict: Dictionary of language codes.
         """
         if _DEF_ESPEAK_LIB is None:
-            raise {}
+            return {}
         args = ["--voices"]
         langs = {}
         count = 0
@@ -157,7 +176,7 @@
                 lang_code = cols[1]
                 lang_name = cols[3]
                 langs[lang_code] = lang_name
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
             count += 1
         return langs
 
@@ -168,9 +187,9 @@ def version(self) -> str:
             str: Version of the used backend.
         """
         args = ["--version"]
-        for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
+        for line in _espeak_exe(self.backend, args, sync=True):
             version = line.decode("utf8").strip().split()[2]
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
             return version
 
     @classmethod
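With the new property, the backend can be switched after construction and is validated on assignment (a usage sketch; an installed `espeak-ng` binary is assumed):

    phonemizer = ESpeak(language="en")  # "en" is now silently mapped to "en-us"
    phonemizer.backend = "espeak-ng"    # anything other than espeak/espeak-ng raises
    print(phonemizer.backend)
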
""" args = ["--version"] - for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True): + for line in _espeak_exe(self.backend, args, sync=True): version = line.decode("utf8").strip().split()[2] - logging.debug("line: %s" % repr(line)) + logging.debug("line: %s", repr(line)) return version @classmethod diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index d0aa469e26..f3e9c9abd4 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from os import stat from typing import List import gruut @@ -55,7 +54,7 @@ def __init__( def name(): return "gruut" - def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: + def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument """Convert input text to phonemes. Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 4f93edeb6b..60b965f9d8 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer): language = "ja-jp" - def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): + def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod @@ -61,12 +61,12 @@ def is_available(self) -> bool: return True -if __name__ == "__main__": - text = "これは、電話をかけるための私の日本語の例のテキストです。" - e = JA_JP_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "これは、電話をかけるための私の日本語の例のテキストです。" +# e = JA_JP_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index e8b2ce347e..e36b0a2a1f 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -17,7 +17,7 @@ class MultiPhonemizer: lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER language = "multi-lingual" - def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: + def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @@ -40,16 +40,16 @@ def supported_languages(self) -> List: return list(self.lang_to_phonemizer_name.keys()) -if __name__ == "__main__": - texts = { - "tr": "Merhaba, bu Türkçe bit örnek!", - "en-us": "Hello, this is English example!", - "de": "Hallo, das ist ein Deutches Beipiel!", - "zh-cn": "这是中国的例子", - } - phonemes = {} - ph = MultiPhonemizer() - for lang, text in texts.items(): - phoneme = ph.phonemize(text, lang) - phonemes[lang] = phoneme - print(phonemes) +# if __name__ == "__main__": +# texts = { +# "tr": "Merhaba, bu Türkçe bit örnek!", +# "en-us": "Hello, this is English example!", +# 
"de": "Hallo, das ist ein Deutches Beipiel!", +# "zh-cn": "这是中国的例子", +# } +# phonemes = {} +# ph = MultiPhonemizer() +# for lang, text in texts.items(): +# phoneme = ph.phonemize(text, lang) +# phonemes[lang] = phoneme +# print(phonemes) diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index e1bd77c7d8..5a4a55911d 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer): language = "zh-cn" - def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): + def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod def name(): return "zh_cn_phonemizer" - def phonemize_zh_cn(self, text: str, separator: str = "|") -> str: + @staticmethod + def phonemize_zh_cn(text: str, separator: str = "|") -> str: ph = chinese_text_to_phonemes(text, separator) return ph @@ -50,12 +51,12 @@ def is_available(self) -> bool: return True -if __name__ == "__main__": - text = "这是,样本中文。" - e = ZH_CN_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "这是,样本中文。" +# e = ZH_CN_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py index 414ac25366..09087d5fcf 100644 --- a/TTS/tts/utils/text/punctuation.py +++ b/TTS/tts/utils/text/punctuation.py @@ -130,7 +130,7 @@ def restore(cls, text, puncs): return cls._restore(text, puncs, 0) @classmethod - def _restore(cls, text, puncs, num): + def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements """Auxiliary method for Punctuation.restore()""" if not puncs: return text @@ -159,14 +159,14 @@ def _restore(cls, text, puncs, num): return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) -if __name__ == "__main__": - punc = Punctuation() - text = "This is. This is, example!" +# if __name__ == "__main__": +# punc = Punctuation() +# text = "This is. This is, example!" 
diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py
index 9c6bf4541e..55ce49b508 100644
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -383,8 +383,7 @@ def __init__(
     def init_from_config(config: "Coqpit"):
         if "audio" in config:
             return AudioProcessor(**config.audio)
-        else:
-            return AudioProcessor(**config)
+        return AudioProcessor(**config)
 
     ### setting up the parameters ###
     def _build_mel_basis(

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2e4f4735bc..f6a1ae6ab1 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -13,7 +13,6 @@
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
 from TTS.tts.utils.synthesis import synthesis, trim_silence
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py
index e56d1db493..f78d69b86e 100644
--- a/TTS/vocoder/models/gan.py
+++ b/TTS/vocoder/models/gan.py
@@ -314,7 +314,7 @@ def get_data_loader(  # pylint: disable=no-self-use
         data_items: List,
         verbose: bool,
         num_gpus: int,
-        rank: int = 0,  # pylint: disable=unused-argument
+        rank: int = None,  # pylint: disable=unused-argument
     ):
         """Initiate and return the GAN dataloader.
From 49fef8db61f9fd59104b8887df92b481aef3698c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:52:45 +0000
Subject: [PATCH 12/67] Allow None pad and blank tokens

---
 TTS/tts/utils/text/tokenizer.py | 48 ++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 68a1c57548..3f416bbb86 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -57,8 +57,8 @@ def characters(self):
     @characters.setter
     def characters(self, new_characters):
         self._characters = new_characters
-        self.pad_id = self.characters.char_to_id(self.characters.pad)
-        self.blank_id = self.characters.char_to_id(self.characters.blank)
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
 
     def encode(self, text: str) -> List[int]:
         """Encodes a string of text as a sequence of IDs."""
@@ -82,7 +82,7 @@ def decode(self, token_ids: List[int]) -> str:
             text += self.characters.id_to_char(token_id)
         return text
 
-    def text_to_ids(self, text: str, language: str = None) -> List[int]:
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
         """Converts a string of text to a sequence of token IDs.
 
         Args:
@@ -137,32 +137,50 @@ def print_logs(self, level: int = 0):
             print(f"{indent}| > {char}")
 
     @staticmethod
-    def init_from_config(config: "Coqpit"):
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
         """Init Tokenizer object from config
 
         Args:
             config (Coqpit): Coqpit model config.
+            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
+                the config values. Defaults to None.
         """
         # init cleaners
         if isinstance(config.text_cleaner, (str, list)):
             text_cleaner = getattr(cleaners, config.text_cleaner)
 
+        # init characters
+        if characters is None:
+            if config.use_phonemes:
+                # init phoneme set
+                characters, new_config = IPAPhonemes().init_from_config(config)
+            else:
+                # init character set
+                characters, new_config = Graphemes().init_from_config(config)
+        else:
+            characters, new_config = characters.init_from_config(config)
+
+        # init phonemizer
         phonemizer = None
         if config.use_phonemes:
-            # init phoneme set
-            characters = IPAPhonemes().init_from_config(config)
             phonemizer_kwargs = {"language": config.phoneme_language}
-            # init phonemizer
             if "phonemizer" in config and config.phonemizer:
                 phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
             else:
-                phonemizer = get_phonemizer_by_name(
-                    DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
-                )
-        else:
-            # init character set
-            characters = Graphemes().init_from_config(config)
-        return TTSTokenizer(
-            config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
+                try:
+                    phonemizer = get_phonemizer_by_name(
+                        DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
+                    )
+                except KeyError as e:
+                    raise ValueError(
+                        f"""No phonemizer found for language {config.phoneme_language}.
+                        You may need to install a third party library for this language."""
+                    ) from e
+
+        return (
+            TTSTokenizer(
+                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
+            ),
+            new_config,
         )
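As with the character classes, `TTSTokenizer.init_from_config` now returns a `(tokenizer, config)` pair, so call sites change accordingly (sketch):

    tokenizer, config = TTSTokenizer.init_from_config(config)
    # `pad_id` / `blank_id` may now be None when the character set defines no
    # pad or blank token, so downstream code should guard against that.
    print(tokenizer.pad_id, tokenizer.blank_id)
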
""" speaker_manager = None - if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding is True: + if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding: + if samples: + speaker_manager = SpeakerManager(data_items=samples) if config.get("speaker_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=config.speaker_file) if config.get("speakers_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - if hasattr(config, "use_d_vector_file") and config.use_speaker_embedding is True: + if hasattr(config, "use_d_vector_file") and config.use_d_vector_file: if config.get("speakers_file", None): speaker_manager = SpeakerManager(d_vectors_file_path=config.speaker_file) if config.get("d_vector_file", None): From ae96243e24d4412261e33cb46ac92e48d484b4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:55:18 +0000 Subject: [PATCH 15/67] Update VITS for the new API --- TTS/tts/models/vits.py | 210 ++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index aa578ff8cd..957994f989 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1,7 +1,8 @@ import math -from dataclasses import dataclass, field +import random +from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import torch @@ -11,6 +12,7 @@ from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F +from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder @@ -20,6 +22,7 @@ from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, get_scheduler @@ -280,91 +283,78 @@ def __init__( self.END2END = True self.speaker_manager = speaker_manager self.language_manager = language_manager - if config.__class__.__name__ == "VitsConfig": - # loading from VitsConfig - self.num_chars = self.tokenizer.characters.num_chars - self.config = config - args = self.config.model_args - elif isinstance(config, VitsArgs): - # loading from VitsArgs - self.config = config - args = config - else: - raise ValueError("config must be either a VitsConfig or VitsArgs") self.args = args self.init_multispeaker(config) self.init_multilingual(config) - self.length_scale = args.length_scale - self.noise_scale = args.noise_scale - self.inference_noise_scale = args.inference_noise_scale - self.inference_noise_scale_dp = args.inference_noise_scale_dp - self.noise_scale_dp = args.noise_scale_dp - self.max_inference_len = args.max_inference_len - self.spec_segment_size = args.spec_segment_size + self.length_scale = self.args.length_scale + self.noise_scale = self.args.noise_scale + self.inference_noise_scale = self.args.inference_noise_scale + self.inference_noise_scale_dp = self.args.inference_noise_scale_dp + self.noise_scale_dp = 
From ae96243e24d4412261e33cb46ac92e48d484b4bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:55:18 +0000
Subject: [PATCH 15/67] Update VITS for the new API

---
 TTS/tts/models/vits.py | 210 ++++++++++++++++++++---------------------
 1 file changed, 105 insertions(+), 105 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index aa578ff8cd..957994f989 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,7 +1,8 @@
 import math
-from dataclasses import dataclass, field
+import random
+from dataclasses import dataclass, field, replace
 from itertools import chain
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import torch
@@ -11,6 +12,7 @@
 from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
 
+from TTS.tts.configs.shared_configs import CharactersConfig
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
@@ -20,6 +22,7 @@
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.synthesis import synthesis
+from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment
 from TTS.utils.trainer_utils import get_optimizer, get_scheduler
@@ -280,91 +283,78 @@ def __init__(
         self.END2END = True
         self.speaker_manager = speaker_manager
         self.language_manager = language_manager
-        if config.__class__.__name__ == "VitsConfig":
-            # loading from VitsConfig
-            self.num_chars = self.tokenizer.characters.num_chars
-            self.config = config
-            args = self.config.model_args
-        elif isinstance(config, VitsArgs):
-            # loading from VitsArgs
-            self.config = config
-            args = config
-        else:
-            raise ValueError("config must be either a VitsConfig or VitsArgs")
         self.args = args
 
         self.init_multispeaker(config)
         self.init_multilingual(config)
 
-        self.length_scale = args.length_scale
-        self.noise_scale = args.noise_scale
-        self.inference_noise_scale = args.inference_noise_scale
-        self.inference_noise_scale_dp = args.inference_noise_scale_dp
-        self.noise_scale_dp = args.noise_scale_dp
-        self.max_inference_len = args.max_inference_len
-        self.spec_segment_size = args.spec_segment_size
+        self.length_scale = self.args.length_scale
+        self.noise_scale = self.args.noise_scale
+        self.inference_noise_scale = self.args.inference_noise_scale
+        self.inference_noise_scale_dp = self.args.inference_noise_scale_dp
+        self.noise_scale_dp = self.args.noise_scale_dp
+        self.max_inference_len = self.args.max_inference_len
+        self.spec_segment_size = self.args.spec_segment_size
""" - ap = assets["audio_processor"] - self._log(ap, batch, outputs, "train") + self._log(self.ap, batch, outputs, "train") @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - return self._log(ap, batch, outputs, "eval") + return self._log(self.ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap) -> Tuple[Dict, Dict]: + def test_run(self) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -985,36 +973,6 @@ def get_criterion(self): return [VitsGeneratorLoss(self.config), VitsDiscriminatorLoss(self.config)] - @staticmethod - def make_symbols(config): - """Create a custom arrangement of symbols used by the model. The output list of symbols propagate along the - whole training and inference steps.""" - _pad = config.characters["pad"] - _punctuations = config.characters["punctuations"] - _letters = config.characters["characters"] - _letters_ipa = config.characters["phonemes"] - symbols = [_pad] + list(_punctuations) + list(_letters) - if config.use_phonemes: - symbols += list(_letters_ipa) - return symbols - - @staticmethod - def get_characters(config: Coqpit): - if config.characters is not None: - symbols = Vits.make_symbols(config) - else: - from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel - parse_symbols, - phonemes, - symbols, - ) - - config.characters = parse_symbols() - if config.use_phonemes: - symbols = phonemes - num_chars = len(symbols) + getattr(config, "add_blank", False) - return symbols, config, num_chars - def load_checkpoint( self, config, checkpoint_path, eval=False ): # pylint: disable=unused-argument, redefined-builtin @@ -1030,23 +988,65 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "Coqpit"): - """Initialize model from config.""" + def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config - # init characters - if config.use_phonemes: - from TTS.tts.utils.text.characters import IPAPhonemes + Args: + config (VitsConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
+ """ + from TTS.utils.audio import AudioProcessor - characters = IPAPhonemes().init_from_config(config) - else: - from TTS.tts.utils.text.characters import Graphemes + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return Vits(new_config, ap, tokenizer, speaker_manager) - characters = Graphemes().init_from_config(config) - config.num_chars = characters.num_chars - from TTS.utils.audio import AudioProcessor +class VitsCharacters(BaseCharacters): + """Characters class for VITs model for compatibility with pre-trained models""" - ap = AudioProcessor.init_from_config(config) - tokenizer = TTSTokenizer.init_from_config(config) - speaker_manager = SpeakerManager.init_from_config(config) - return Vits(config, ap, tokenizer, speaker_manager) + def __init__( + self, + graphemes: str = _characters, + punctuations: str = _punctuations, + pad: str = _pad, + ipa_characters: str = _phonemes, + ) -> None: + if ipa_characters is not None: + graphemes += ipa_characters + super().__init__(graphemes, punctuations, pad, None, None, "", is_unique=False, is_sorted=True) + + def _create_vocab(self): + self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank] + self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} + # pylint: disable=unnecessary-comprehension + self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} + + @staticmethod + def init_from_config(config: Coqpit): + if config.characters is not None: + _pad = config.characters["pad"] + _punctuations = config.characters["punctuations"] + _letters = config.characters["characters"] + _letters_ipa = config.characters["phonemes"] + return ( + VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad), + config, + ) + characters = VitsCharacters() + new_config = replace(config, characters=characters.to_config()) + return characters, new_config + + def to_config(self) -> "CharactersConfig": + return CharactersConfig( + characters=self._characters, + punctuations=self._punctuations, + pad=self._pad, + eos=None, + bos=None, + blank=self._blank, + is_unique=False, + is_sorted=True, + ) From 160115bbdec9bb45eb970abef12e721e0c0b3c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:55:45 +0000 Subject: [PATCH 16/67] Update Tacotron models --- TTS/tts/models/base_tacotron.py | 22 +++++++++++++++-- TTS/tts/models/tacotron.py | 43 +++++++++++++++++++++++--------- TTS/tts/models/tacotron2.py | 44 +++++++++++++++++++++++---------- 3 files changed, 82 insertions(+), 27 deletions(-) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index ca8f3bb9ed..54939c61c1 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -9,6 +9,8 @@ from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import sequence_mask +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.generic_utils import format_aux_input from TTS.utils.io import load_fsspec from TTS.utils.training import gradual_training_scheduler @@ -17,8 +19,14 @@ class BaseTacotron(BaseTTS): """Base class shared by Tacotron and Tacotron2""" - def __init__(self, config: Coqpit): - super().__init__(config) + def __init__( + self, + config: "TacotronConfig", + ap: 
"AudioProcessor", + tokenizer: "TTSTokenizer", + speaker_manager: SpeakerManager = None, + ): + super().__init__(config, ap, tokenizer, speaker_manager) # pass all config fields as class attributes for key in config: @@ -107,6 +115,16 @@ def get_criterion(self) -> nn.Module: """Get the model criterion used in training.""" return TacotronLoss(self.config) + @staticmethod + def init_from_config(config: Coqpit): + """Initialize model from config.""" + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config) + return BaseTacotron(config, ap, tokenizer, speaker_manager) + ############################# # COMMON COMPUTE FUNCTIONS ############################# diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 4e46d252bf..8341f5bbd2 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,8 @@ # coding: utf-8 +from typing import Dict, List, Union + import torch -from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -10,6 +11,7 @@ from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -24,12 +26,15 @@ class Tacotron(BaseTacotron): a multi-speaker model. Defaults to None. """ - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): - super().__init__(config) + def __init__( + self, + config: "TacotronConfig", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - self.speaker_manager = speaker_manager - chars, self.config, _ = self.get_characters(config) - config.num_chars = self.num_chars = len(chars) + super().__init__(config, ap, tokenizer, speaker_manager) # pass all config fields to `self` # for fewer code change @@ -302,16 +307,30 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) + + @staticmethod + def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (TacotronConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
+ """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return Tacotron(new_config, ap, tokenizer, speaker_manager) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index ead3bf2b8e..d4e665e347 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,9 +1,8 @@ # coding: utf-8 -from typing import Dict +from typing import Dict, List, Union import torch -from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -12,6 +11,7 @@ from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -40,12 +40,16 @@ class Tacotron2(BaseTacotron): Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None. """ - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): - super().__init__(config) + def __init__( + self, + config: "Tacotron2Config", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): + + super().__init__(config, ap, tokenizer, speaker_manager) - self.speaker_manager = speaker_manager - chars, self.config, _ = self.get_characters(config) - config.num_chars = len(chars) self.decoder_output_dim = config.out_channels # pass all config fields to `self` @@ -325,16 +329,30 @@ def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use """Log training progress.""" - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) + + @staticmethod + def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (Tacotron2Config): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
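+
+        Example:
+            A minimal sketch, assuming a populated `Tacotron2Config`; the returned model
+            already carries its audio processor and tokenizer:
+
+                >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
+                >>> model = Tacotron2.init_from_config(Tacotron2Config())
+                >>> sample_rate = model.ap.sample_rate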
+ """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(new_config, samples) + return Tacotron2(new_config, ap, tokenizer, speaker_manager) From 0ff11d4a356e6e99a8deb3af35bec53adcd830fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:16 +0000 Subject: [PATCH 17/67] Update ForwardTTS --- TTS/tts/models/base_tts.py | 19 +++++++---------- TTS/tts/models/forward_tts.py | 40 ++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 272317905b..5986232231 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,6 +1,6 @@ import os import random -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -56,9 +56,10 @@ def _set_model_args(self, config: Coqpit): """ # don't use isintance not to import recursively if "Config" in config.__class__.__name__: - num_chars = ( - self.config.model_args.num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars + config_num_chars = ( + self.config.model_args.num_chars if hasattr(self.config, "model_args") else self.config.num_chars ) + num_chars = config_num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars if "characters" in config: self.config.num_chars = num_chars if hasattr(self.config, "model_args"): @@ -237,7 +238,7 @@ def get_data_loader( config: Coqpit, assets: Dict, is_eval: bool, - data_items: List, + samples: Union[List[Dict], List[List]], verbose: bool, num_gpus: int, rank: int = None, @@ -274,7 +275,7 @@ def get_data_loader( compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec, compute_f0=config.get("compute_f0", False), f0_cache_path=config.get("f0_cache_path", None), - meta_data=data_items, + samples=samples, ap=self.ap, return_wav=config.return_wav if "return_wav" in config else False, batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, @@ -283,6 +284,7 @@ def get_data_loader( min_audio_len=config.min_audio_len, max_audio_len=config.max_audio_len, phoneme_cache_path=config.phoneme_cache_path, + precompute_num_workers=config.precompute_num_workers, use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, @@ -357,8 +359,6 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
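
        Example:
            A direct-call sketch (the `Trainer` normally invokes this after an eval epoch):

                >>> figures, audios = model.test_run(assets={})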
""" - ap = assets["audio_processor"] - tokenizer = assets["tokenizer"] print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} @@ -370,18 +370,15 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: sen, self.config, "cuda" in str(next(self.parameters()).device), - ap, - tokenizer, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, ) test_audios["{}-audio".format(idx)] = outputs_dict["wav"] test_figures["{}-prediction".format(idx)] = plot_spectrogram( - outputs_dict["outputs"]["model_outputs"], ap, output_fig=False + outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) test_figures["{}-alignment".format(idx)] = plot_alignment( outputs_dict["outputs"]["alignments"], output_fig=False diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index b2c41df5e0..699f31426c 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -14,6 +14,7 @@ from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram @@ -170,11 +171,16 @@ class ForwardTTS(BaseTTS): """ # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: Coqpit, + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - super().__init__(config) + super().__init__(config, ap, tokenizer, speaker_manager) - self.speaker_manager = speaker_manager self.init_multispeaker(config) self.max_duration = self.args.max_duration @@ -692,19 +698,17 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -724,3 +728,19 @@ def on_train_step_start(self, trainer): """Enable binary alignment loss when needed""" if trainer.total_steps_done > self.config.binary_align_loss_start_step: self.use_binary_alignment_loss = True + + @staticmethod + def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + 
Args: + config (ForwardTTSConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return ForwardTTS(new_config, ap, tokenizer, speaker_manager) From f46ad54b89a5ca28a20fbddbe814a5df21b481c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:24 +0000 Subject: [PATCH 18/67] Update AlignTTS --- TTS/tts/models/align_tts.py | 43 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 2fc00b0b90..c1e2ffb34f 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -12,6 +13,7 @@ from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.io import load_fsspec @@ -100,11 +102,16 @@ class AlignTTS(BaseTTS): # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: "AlignTTSConfig", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - super().__init__(config) + super().__init__(config, ap, tokenizer, speaker_manager) self.speaker_manager = speaker_manager - self.config = config self.phase = -1 self.length_scale = ( float(config.model_args.length_scale) @@ -112,10 +119,6 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): else config.model_args.length_scale ) - if not self.config.model_args.num_chars: - _, self.config, num_chars = self.get_characters(config) - self.config.model_args.num_chars = num_chars - self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) self.embedded_speaker_dim = 0 @@ -382,19 +385,17 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -430,3 +431,19 @@ def _set_phase(config, global_step): def 
on_epoch_start(self, trainer): """Set AlignTTS training phase on epoch start.""" self.phase = self._set_phase(trainer.config, trainer.total_steps_done) + + @staticmethod + def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (AlignTTSConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return AlignTTS(new_config, ap, tokenizer, speaker_manager) From a8a836578812390f1bb43cbaf39ece64744fce7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:31 +0000 Subject: [PATCH 19/67] Update GlowTTS --- TTS/tts/models/glow_tts.py | 48 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index af440072cd..3dd8d5c836 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,5 +1,5 @@ import math -from typing import Dict, Tuple, Union +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -50,8 +50,8 @@ class GlowTTS(BaseTTS): def __init__( self, config: GlowTTSConfig, - ap: "AudioProcessor", - tokenizer: "TTSTokenizer", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, speaker_manager: SpeakerManager = None, ): @@ -63,7 +63,6 @@ def __init__( for key in config: setattr(self, key, config[key]) - self.num_chars = self.tokenizer.characters.num_chars self.decoder_output_dim = config.out_channels # init multi-speaker layers if necessary @@ -427,20 +426,18 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: @@ -465,19 +462,16 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: sen, self.config, "cuda" in str(next(self.parameters()).device), - self.ap, - self.tokenizer, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, ) test_audios["{}-audio".format(idx)] = outputs["wav"] test_figures["{}-prediction".format(idx)] = plot_spectrogram( - outputs["outputs"]["model_outputs"], ap, output_fig=False + outputs["outputs"]["model_outputs"], self.ap, 
output_fig=False ) test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios @@ -514,23 +508,17 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: Coqpit): - """Initialize model from config.""" - - # init characters - if config.use_phonemes: - from TTS.tts.utils.text.characters import IPAPhonemes - - characters = IPAPhonemes().init_from_config(config) - else: - from TTS.tts.utils.text.characters import Graphemes - - characters = Graphemes().init_from_config(config) - config.num_chars = characters.num_chars + def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + Args: + config (VitsConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ from TTS.utils.audio import AudioProcessor ap = AudioProcessor.init_from_config(config) - tokenizer = TTSTokenizer.init_from_config(config) - speaker_manager = SpeakerManager.init_from_config(config) - return GlowTTS(config, ap, tokenizer, speaker_manager) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return GlowTTS(new_config, ap, tokenizer, speaker_manager) From 4640d59f39644937d9f18cc83fa3514e0f4791ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:44 +0000 Subject: [PATCH 20/67] Update setup_model --- TTS/tts/models/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index cb1c2e2133..d76a3bebee 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,14 @@ +from typing import Dict, List, Union + from TTS.utils.generic_utils import find_module -def setup_model(config: "Coqpit") -> "BaseTTS": +def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": print(" > Using model: {}".format(config.model)) # fetch the right model implementation. if "base_model" in config and config["base_model"] is not None: MyModel = find_module("TTS.tts.models", config.base_model.lower()) else: MyModel = find_module("TTS.tts.models", config.model.lower()) - model = MyModel.init_from_config(config) + model = MyModel.init_from_config(config, samples) return model From f8fbbd409f68f0ad8e0875185e037c0e48fb0128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:57:51 +0000 Subject: [PATCH 21/67] Update BaseTTS config --- TTS/tts/configs/shared_configs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 10bee3e6c7..c7958fda00 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -78,7 +78,7 @@ class CharactersConfig(Coqpit): is_unique (bool): remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old - models trained with character lists with duplicates. + models trained with character lists with duplicates. Defaults to True. is_sorted (bool): Sort the characters in alphabetical order. Defaults to True. 
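As a usage sketch of the `setup_model` factory updated in PATCH 20/67 above, assuming a recipe-style `config` object and a `train_samples` list returned by `load_tts_samples`, the new `samples` argument simply flows through to the model's `init_from_config`:

    from TTS.tts.models import setup_model

    # dispatches to MyModel.init_from_config(config, samples)
    model = setup_model(config, samples=train_samples)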
@@ -166,6 +166,9 @@ class BaseTTSConfig(BaseTrainingConfig): compute_linear_spec (bool): If True data loader computes and returns linear spectrograms alongside the other data. + precompute_num_workers (int): + Number of workers to precompute features. Defaults to 0. + use_noise_augment (bool): Augment the input audio with random noise. @@ -207,6 +210,7 @@ class BaseTTSConfig(BaseTrainingConfig): phoneme_cache_path: str = None # vocabulary parameters characters: CharactersConfig = None + add_blank: bool = False # training params batch_group_size: int = 0 loss_masking: bool = None @@ -218,8 +222,8 @@ class BaseTTSConfig(BaseTrainingConfig): max_text_len: int = float("inf") compute_f0: bool = False compute_linear_spec: bool = False + precompute_num_workers: int = 0 use_noise_augment: bool = False - add_blank: bool = False # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer From ab413fda6aa2883a08625f0efe24a08f157b9d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:08 +0000 Subject: [PATCH 22/67] Update train_tts.py --- TTS/bin/train_tts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 3360a94051..f053e9d75c 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -81,7 +81,6 @@ def main(): model=model, train_samples=train_samples, eval_samples=eval_samples, - training_assets={"audio_processor": ap}, parse_command_line_args=False, ) trainer.fit() From cee01a66e1fd24f5b7b37e3ca51b24fdb0fd5e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:41 +0000 Subject: [PATCH 23/67] Update ljspeech recipes --- recipes/ljspeech/align_tts/train_aligntts.py | 37 ++++++++++------ .../ljspeech/fast_pitch/train_fast_pitch.py | 30 +++++++------ .../ljspeech/fast_speech/train_fast_speech.py | 30 +++++++------ recipes/ljspeech/glow_tts/train_glowtts.py | 3 +- .../speedy_speech/train_speedy_speech.py | 43 +++++++++++-------- .../tacotron2-DCA/train_tacotron_dca.py | 39 +++++++++++------ .../tacotron2-DDC/train_tacotron_ddc.py | 25 +++++++++-- recipes/ljspeech/vits_tts/train_vits.py | 7 +-- recipes/vctk/vits/train_vits.py | 21 ++++++--- 9 files changed, 155 insertions(+), 80 deletions(-) diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index 68b67d66e7..d0187aa816 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -1,9 +1,11 @@ import os from TTS.trainer import Trainer, TrainingArgs -from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig +from TTS.tts.configs.align_tts_config import AlignTTSConfig +from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.align_tts import AlignTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -31,23 +33,32 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. 
+# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init model -model = AlignTTS(config) +model = AlignTTS(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 🚀 trainer.fit() diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 0a4a965b63..3a772251c8 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -5,6 +5,7 @@ from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.manage import ModelManager @@ -46,9 +47,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=4, print_step=50, print_eval=False, mixed_precision=False, @@ -67,23 +68,28 @@ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. 
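+# A single loaded sample looks like ["Some text.", "/data/LJSpeech-1.1/wavs/LJ001-0001.wav", "ljspeech"] (illustrative values).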
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init the model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index a71da94bae..f9f1bc0605 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -5,6 +5,7 @@ from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.manage import ModelManager @@ -45,9 +46,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=8, print_step=50, print_eval=False, mixed_precision=False, @@ -66,23 +67,28 @@ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init the model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer) # init the trainer and 🚀 trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index 4762a77aa1..dd450a572f 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -52,7 +52,8 @@ # INITIALIZE THE TOKENIZER # Tokenizer is used to convert text to sequences of token IDs. 
-tokenizer = TTSTokenizer.init_from_config(config) +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) # LOAD DATA SAMPLES # Each sample is a list of ```[text, audio_file_path, speaker_name]``` diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 6b9683afb6..468e8a5f12 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -5,6 +5,7 @@ from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -38,9 +39,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=4, print_step=50, print_eval=False, mixed_precision=False, @@ -50,14 +51,22 @@ datasets=[dataset_config], ) -# # compute alignments -# if not config.model_args.use_aligner: -# manager = ModelManager() -# model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") -# # TODO: make compute_attention python callable -# os.system( -# f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" -# ) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) + +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init audio processor ap = AudioProcessor(**config.audio.to_dict()) @@ -66,16 +75,14 @@ train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index cf00ccc2b4..97a16ab7fb 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -6,6 +6,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer @@ -54,23 +55,35 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init model -model = Tacotron2(config) +# INITIALIZE THE MODEL +# Models take a config object and a speaker manager as input +# Config defines the details of the model like the number of layers, the size of the embedding, etc. +# Speaker manager is used by multi-speaker models. +model = Tacotron2(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 🚀 trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index b452094af2..285c416c58 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -6,6 +6,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer @@ -46,6 +47,7 @@ use_phonemes=True, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=8, print_step=25, print_eval=True, mixed_precision=False, @@ -56,11 +58,28 @@ # init audio processor ap = AudioProcessor(**config.audio.to_dict()) -# load training samples +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. 
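+# It is built from `config.audio`, replacing the manual `AudioProcessor(**config.audio.to_dict())` call above.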
+ap = AudioProcessor.init_from_config(config) + +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init model -model = Tacotron2(config) +# INITIALIZE THE MODEL +# Models take a config object and a speaker manager as input +# Config defines the details of the model like the number of layers, the size of the embedding, etc. +# Speaker manager is used by multi-speaker models. +model = Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index 0588e9d9cf..79c0db2e9f 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -33,7 +33,7 @@ config = VitsConfig( audio=audio_config, run_name="vits_ljspeech", - batch_size=48, + batch_size=16, eval_batch_size=16, batch_group_size=5, num_loader_workers=0, @@ -48,7 +48,7 @@ compute_input_seq_cache=True, print_step=25, print_eval=True, - mixed_precision=True, + mixed_precision=False, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], @@ -61,7 +61,8 @@ # INITIALIZE THE TOKENIZER # Tokenizer is used to convert text to sequences of token IDs. -tokenizer = TTSTokenizer.init_from_config(config) +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) # LOAD DATA SAMPLES # Each sample is a list of ```[text, audio_file_path, speaker_name]``` diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 7eb741c4d9..2906557dde 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -63,10 +64,21 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. 
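+# In VCTK each sample carries its speaker name, which the speaker manager below parses into speaker IDs.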
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -76,7 +88,7 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = Vits(config, speaker_manager) +model = Vits(config, ap, tokenizer, speaker_manager) # init the trainer and 🚀 trainer = Trainer( @@ -86,6 +98,5 @@ model=model, train_samples=train_samples, eval_samples=eval_samples, - training_assets={"audio_processor": ap}, ) trainer.fit() From e9448ca33901c176f5e765e4f97065958bed8136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:55 +0000 Subject: [PATCH 24/67] Update loader tests --- tests/data_tests/test_loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index ac850a1440..f2f2a8d238 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - tokenizer = TTSTokenizer.init_from_config(c) + tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=True, @@ -101,8 +101,8 @@ def test_loader(self): if self.ap.symmetric_norm: self.assertLessEqual(mel_input.max(), self.ap.max_norm) self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm - ) # pylint: disable=invalid-unary-operand-type + mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type + ) self.assertLess(mel_input.min(), 0) else: self.assertLessEqual(mel_input.max(), self.ap.max_norm) From c974633aa9d5254714ddf032aa1b441d31177745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:59:11 +0000 Subject: [PATCH 25/67] Update tests --- tests/inference_tests/test_synthesize.py | 12 ++-- tests/text_tests/test_characters.py | 4 +- tests/text_tests/test_phonemizer.py | 85 +++++++++++++++++++++--- tests/text_tests/test_tokenizer.py | 14 ++-- tests/tts_tests/test_glow_tts_train.py | 1 - tests/tts_tests/test_vits_train.py | 1 - 6 files changed, 90 insertions(+), 27 deletions(-) diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 635506ab21..42b7717281 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -19,9 +19,9 @@ def test_synthesize(): f'--text "This is an example." --out_path "{output_path}"' ) - # multi-speaker model - run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - run_cli( - f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - f'--text "This is an example." --out_path "{output_path}"' - ) + # multi-speaker SC-Glow model + # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") + # run_cli( + # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' + # f'--text "This is an example." 
--out_path "{output_path}"' + # ) diff --git a/tests/text_tests/test_characters.py b/tests/text_tests/test_characters.py index ed84b5b40b..3f4086d59e 100644 --- a/tests/text_tests/test_characters.py +++ b/tests/text_tests/test_characters.py @@ -2,6 +2,8 @@ from TTS.tts.utils.text.characters import BaseCharacters, Graphemes, IPAPhonemes, create_graphemes, create_phonemes +# pylint: disable=protected-access + def test_make_symbols(): _ = create_phonemes() @@ -12,7 +14,7 @@ class BaseCharacterTest(unittest.TestCase): def setUp(self): self.characters_empty = BaseCharacters("", "", pad="", eos="", bos="", blank="", is_unique=True, is_sorted=True) - def test_default_character_sets(self): + def test_default_character_sets(self): # pylint: disable=no-self-use """Test initiation of default character sets""" _ = IPAPhonemes() _ = Graphemes() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index aa7a54991a..512cc195f3 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -1,20 +1,38 @@ import unittest -from TTS.tts.utils.text.characters import BaseCharacters, Graphemes, IPAPhonemes, create_graphemes, create_phonemes from TTS.tts.utils.text.phonemizers import ESpeak, Gruut, JA_JP_Phonemizer, ZH_CN_Phonemizer -from TTS.tts.utils.text.tokenizer import TTSTokenizer -EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" +EXAMPLE_TEXTs = [ + "Recent research at Harvard has shown meditating", + "for as little as 8 weeks can actually increase, the grey matter", + "in the parts of the brain responsible", + "for emotional regulation and learning!", +] + + +EXPECTED_ESPEAK_PHONEMES = [ + "ɹ|ˈiː|s|ə|n|t ɹ|ɪ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ɪ|t|ˌeɪ|ɾ|ɪ|ŋ", + "f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|n|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ", + "ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ɪ|s|p|ˈɑː|n|s|ə|b|əl", + "f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|uː|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!", +] + + +EXPECTED_ESPEAKNG_PHONEMES = [ + "ɹ|ˈiː|s|ə|n|t ɹ|ᵻ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ᵻ|t|ˌeɪ|ɾ|ɪ|ŋ", + "f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|ŋ|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ", + "ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ᵻ|s|p|ˈɑː|n|s|ᵻ|b|əl", + "f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|ʊ|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!", +] class TestEspeakPhonemizer(unittest.TestCase): def setUp(self): - self.phonemizer = ESpeak(language="en-us") - self.EXPECTED_PHONEMES = "ɹ|ˈiː|s|ə|n|t ɹ|ɪ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ᵻ|t|ˌeɪ|ɾ|ɪ|ŋ f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|n|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ|ɹ ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ɪ|s|p|ˈɑː|n|s|ə|b|əl f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|uː|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!" + self.phonemizer = ESpeak(language="en-us", backend="espeak") - def test_phonemize(self): - output = self.phonemizer.phonemize(EXAMPLE_TEXT, separator="|") - self.assertEqual(output, self.EXPECTED_PHONEMES) + for text, ph in zip(EXAMPLE_TEXTs, EXPECTED_ESPEAK_PHONEMES): + phonemes = self.phonemizer.phonemize(text) + self.assertEqual(phonemes, ph) # multiple punctuations text = "Be a voice, not an! echo?" 
@@ -48,14 +66,59 @@ def test_is_available(self): self.assertTrue(self.phonemizer.is_available()) +class TestEspeakNgPhonemizer(unittest.TestCase): + def setUp(self): + self.phonemizer = ESpeak(language="en-us", backend="espeak-ng") + + for text, ph in zip(EXAMPLE_TEXTs, EXPECTED_ESPEAKNG_PHONEMES): + phonemes = self.phonemizer.phonemize(text) + self.assertEqual(phonemes, ph) + + # multiple punctuations + text = "Be a voice, not an! echo?" + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?" + output = self.phonemizer.phonemize(text, separator="|") + output = output.replace("|", "") + self.assertEqual(output, gt) + + # not ending with punctuation + text = "Be a voice, not an! echo" + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ" + output = self.phonemizer.phonemize(text, separator="") + self.assertEqual(output, gt) + + # extra space after the sentence + text = "Be a voice, not an! echo. " + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ." + output = self.phonemizer.phonemize(text, separator="") + self.assertEqual(output, gt) + + def test_name(self): + self.assertEqual(self.phonemizer.name(), "espeak") + + def test_get_supported_languages(self): + self.assertIsInstance(self.phonemizer.supported_languages(), dict) + + def test_get_version(self): + self.assertIsInstance(self.phonemizer.version(), str) + + def test_is_available(self): + self.assertTrue(self.phonemizer.is_available()) + + class TestGruutPhonemizer(unittest.TestCase): def setUp(self): self.phonemizer = Gruut(language="en-us", use_espeak_phonemes=True, keep_stress=False) - self.EXPECTED_PHONEMES = "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ| f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l| f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" + self.EXPECTED_PHONEMES = ["ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", + "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", + "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", + "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" + ] def test_phonemize(self): - output = self.phonemizer.phonemize(EXAMPLE_TEXT, separator="|") - self.assertEqual(output, self.EXPECTED_PHONEMES) + for text, ph in zip(EXAMPLE_TEXTs, self.EXPECTED_PHONEMES): + phonemes = self.phonemizer.phonemize(text, separator="|") + self.assertEqual(phonemes, ph) # multiple punctuations text = "Be a voice, not an! echo?" 
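The split into `EXPECTED_ESPEAK_PHONEMES` and `EXPECTED_ESPEAKNG_PHONEMES` above reflects that the two backends emit slightly different IPA for the same input. A minimal sketch of the API these tests exercise, assuming the `espeak` and `espeak-ng` binaries are installed:

    from TTS.tts.utils.text.phonemizers import ESpeak

    ph = ESpeak(language="en-us", backend="espeak")
    print(ph.phonemize("Be a voice, not an! echo?", separator="|"))

    # the backend can also be swapped after construction, as the tokenizer test does
    ph.backend = "espeak-ng"
    print(ph.phonemize("Be a voice, not an! echo?", separator="|"))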
diff --git a/tests/text_tests/test_tokenizer.py b/tests/text_tests/test_tokenizer.py index 4d3fb0ce70..471745185f 100644 --- a/tests/text_tests/test_tokenizer.py +++ b/tests/text_tests/test_tokenizer.py @@ -1,6 +1,5 @@ import unittest from dataclasses import dataclass -from os import sep from coqpit import Coqpit @@ -13,7 +12,7 @@ class TestTTSTokenizer(unittest.TestCase): def setUp(self): self.tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes()) - self.ph = ESpeak("tr") + self.ph = ESpeak("tr", backend="espeak") self.tokenizer_ph = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) def test_encode_decode_graphemes(self): @@ -54,12 +53,12 @@ def test_print_logs(self): def test_not_found_characters(self): self.ph = ESpeak("en-us") - self.tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) + tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) self.assertEqual(len(self.tokenizer.not_found_characters), 0) text = "Yolk of one egg beaten light" - ids = self.tokenizer_local.text_to_ids(text) - text_hat = self.tokenizer_local.ids_to_text(ids) - self.assertEqual(self.tokenizer_local.not_found_characters, ["̩"]) + ids = tokenizer_local.text_to_ids(text) + text_hat = tokenizer_local.ids_to_text(ids) + self.assertEqual(tokenizer_local.not_found_characters, ["̩"]) self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt") def test_init_from_config(self): @@ -85,7 +84,8 @@ class TokenizerConfig(Coqpit): text_cleaner: str = "phoneme_cleaners" characters = Characters() - tokenizer_ph = TTSTokenizer.init_from_config(TokenizerConfig()) + tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig()) + tokenizer_ph.phonemizer.backend = "espeak" text = "Bu bir Örnek." 
text_ph = "" + self.ph.phonemize(text, separator="") + "" ids = tokenizer_ph.text_to_ids(text) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index e590107658..7796b76051 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 607f7b29de..25793c0603 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 848fd73acaab80ed9372c2c466de37af8b2da349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:59:28 +0000 Subject: [PATCH 26/67] Update spec extractor --- TTS/bin/extract_tts_spectrograms.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 7b489fd653..40079f1b46 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -13,6 +13,7 @@ from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -20,21 +21,20 @@ def setup_loader(ap, r, verbose=False): + tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( - r, - c.text_cleaner, + outputs_per_step=r, compute_linear_spec=False, - meta_data=meta_data, + samples=meta_data, + tokenizer=tokenizer, ap=ap, - characters=c.characters if "characters" in c.keys() else None, - add_blank=c["add_blank"] if "add_blank" in c.keys() else False, batch_group_size=0, - min_seq_len=c.min_seq_len, - max_seq_len=c.max_seq_len, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, phoneme_cache_path=c.phoneme_cache_path, - use_phonemes=c.use_phonemes, - phoneme_language=c.phoneme_language, - enable_eos_bos=c.enable_eos_bos_chars, + precompute_num_workers=0, use_noise_augment=False, verbose=verbose, speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, @@ -44,7 +44,7 @@ def setup_loader(ap, r, verbose=False): if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
dataset.compute_input_seq(c.num_loader_workers) - dataset.sort_and_filter_items(c.get("sort_by_audio_len", default=False)) + dataset.preprocess_samples() loader = DataLoader( dataset, @@ -75,8 +75,8 @@ def set_filename(wav_path, out_path): def format_data(data): # setup input data - text_input = data["text"] - text_lengths = data["text_lengths"] + text_input = data["token_id"] + text_lengths = data["token_id_lengths"] mel_input = data["mel"] mel_lengths = data["mel_lengths"] item_idx = data["item_idxs"] From 3a15e2f88727cfbdcd6d50173ee0b2f925d98b04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 13:01:53 +0000 Subject: [PATCH 27/67] Update ljspeech download --- recipes/ljspeech/download_ljspeech.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh index 14ef058da6..9468988a99 100644 --- a/recipes/ljspeech/download_ljspeech.sh +++ b/recipes/ljspeech/download_ljspeech.sh @@ -10,5 +10,5 @@ tar -xjf LJSpeech-1.1.tar.bz2 shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ +mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ rm LJSpeech-1.1.tar.bz2 \ No newline at end of file From 9338c7b6c4d5c75107e0b370603d1cb65111ce71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 13:02:02 +0000 Subject: [PATCH 28/67] Update pylintrc --- .pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 6e9f953edd..d5f9c4909c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -168,7 +168,8 @@ disable=missing-docstring, exception-escape, comprehension-escape, duplicate-code, - not-callable + not-callable, + import-outside-toplevel # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option From 672d766906cfe47be801d934c6fcd8dc8d314873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 14:45:32 +0000 Subject: [PATCH 29/67] Update VCTK formatter --- TTS/tts/datasets/formatters.py | 37 +++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 1f23f85e6f..6dd91bc8c2 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,8 +285,10 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): - """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" +def vctk(root_path, meta_files=None, wavs_path="wav22", mic="mic2"): + """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" + file_ext = 'flac' + test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: @@ -298,26 +300,33 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_" + speaker_id]) - + # p280 has no mic2 recordings + if speaker_id == "p280": + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_mic1.{file_ext}") + else: + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") + if os.path.exists(wav_file): + items.append([text, wav_file, "VCTK_" + speaker_id]) + else: + print(f" [!] 
wav files don't exist - {wav_file}") return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): # pylint: disable=unused-argument +def vctk_old(root_path, meta_files=None, wavs_path="wav48"): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" + test_speakers = meta_files items = [] - txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) - for text_file in txt_files: - _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - # ignore speakers - if isinstance(ignored_speakers, list): - if speaker_id in ignored_speakers: + if isinstance(test_speakers, list): # if is list ignore this speakers ids + if speaker_id in test_speakers: continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([None, wav_file, "VCTK_" + speaker_id]) - + items.append([text, wav_file, "VCTK_old_" + speaker_id]) return items From cecce069a8c32fa42c8f638a9bb693e731acb93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 14:45:57 +0000 Subject: [PATCH 30/67] Add file_ext args to resample.py --- TTS/bin/resample.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py index 3c5ef29c21..c9f1166a64 100644 --- a/TTS/bin/resample.py +++ b/TTS/bin/resample.py @@ -26,6 +26,7 @@ def resample_file(func_args): --input_dir /root/LJSpeech-1.1/ --output_sr 22050 --output_dir /root/resampled_LJSpeech-1.1/ + --file_ext wav --n_jobs 24 """, formatter_class=RawTextHelpFormatter, @@ -55,6 +56,14 @@ def resample_file(func_args): help="Path of the destination folder. 
If not defined, the operation is done in place", ) + parser.add_argument( + "--file_ext", + type=str, + default="wav", + required=False, + help="Extension of the audio files to resample", + ) + parser.add_argument( "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores" ) @@ -67,7 +76,7 @@ def resample_file(func_args): args.input_dir = args.output_dir print("Resampling the audio files...") - audio_files = glob.glob(os.path.join(args.input_dir, "**/*.wav"), recursive=True) + audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True) print(f"Found {len(audio_files)} files...") audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr])) with Pool(processes=args.n_jobs) as p: From 95df38c66007a95849600259a80f6c0f006f8f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 15:15:56 +0000 Subject: [PATCH 31/67] Update VCTK recipes --- TTS/tts/datasets/formatters.py | 2 +- .../speedy_speech/train_speedy_speech.py | 6 --- recipes/vctk/fast_pitch/train_fast_pitch.py | 43 +++++++++++------ recipes/vctk/fast_speech/train_fast_speech.py | 48 +++++++++++-------- recipes/vctk/glow_tts/train_glow_tts.py | 41 +++++++++++----- .../vctk/speedy_speech/train_speedy_speech.py | 44 ++++++++++------- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 42 ++++++++++------ .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 41 ++++++++++------ recipes/vctk/tacotron2/train_tacotron2.py | 41 ++++++++++------ 9 files changed, 192 insertions(+), 116 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 6dd91bc8c2..7e47c44d98 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,7 +285,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav22", mic="mic2"): +def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" file_ext = 'flac' test_speakers = meta_files diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 468e8a5f12..2f8896c577 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -68,12 +68,6 @@ # Check `TTS.tts.datasets.load_tts_samples` for more details. 
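 # For reference, each loaded sample is a plain list in the documented order
 # [text, audio_file_path, speaker_name]; a hypothetical LJSpeech item:
 #   ["in the only sense with which we are at present concerned", "/data/LJSpeech-1.1/wavs/LJ001-0002.wav", "ljspeech"]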
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) - -# load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) - # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index f40587e091..f7a2ef068a 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, + precompute_num_workers=4, compute_f0=True, f0_cache_path=os.path.join(output_path, "f0_cache"), run_eval=True, @@ -39,23 +41,35 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,15 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() + diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index b29888099a..853bbb545f 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -25,37 +26,48 @@ ) config = FastSpeechConfig( - run_name="fast_pitch_ljspeech", + run_name="fast_speech_vctk", audio=audio_config, batch_size=32, eval_batch_size=16, num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 8c9f538865..30050ef535 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # set experiment paths @@ -32,6 +33,7 @@ eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, @@ -45,12 +47,27 @@ output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -60,16 +77,14 @@ config.num_speakers = speaker_manager.num_speakers # init model -model = GlowTTS(config, speaker_manager) +model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 81f78d265b..85e347fc07 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,30 +33,41 @@ num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index b0030f1749..7960b34bc4 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron import Tacotron from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, r=6, @@ -45,18 +47,30 @@ print_step=25, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=0, - max_seq_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron(config, speaker_manager) +model = Tacotron(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 63efb78470..bc7951b572 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 346d650b8f..82dedade77 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit()

From b4cbf2e62fb4b3d5e6fd00a7f83a08a41cdc957a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Dec 2021 15:16:16 +0000
Subject: [PATCH 32/67] Fix `too many open files`

---
 TTS/tts/datasets/dataset.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 229f59c7a0..50fd97d997 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -11,6 +11,10 @@
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor

+# to prevent too many open files error as suggested here
+# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
+torch.multiprocessing.set_sharing_strategy('file_system')
+

 def _parse_sample(item):
     language_name = None

From bbad03ed843c705bb8848e630303e1bac02db55f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Dec 2021 15:18:14 +0000
Subject: [PATCH 33/67] Update recipes README.md

---
 recipes/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/recipes/README.md b/recipes/README.md
index cf3f3de94d..21a6727d8b 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -11,6 +11,12 @@
 $ sh ./recipes/<dataset_name>/download_<dataset_name>.sh
 $ python recipes/<dataset_name>/<model_name>/train.py
 ```

+For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050 Hz as follows.
+
+```console
+python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac
+```
+
 If you train a new model using TTS, feel free to share your training to expand the list of recipes.

 You can also open a new discussion and share your progress with the 🐸 community.
\ No newline at end of file From 13a8f7151e2d2501d2a1ddee8a27482a23cd1c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:32:31 +0000 Subject: [PATCH 34/67] Delete `use_espeak_phonemes` from tests --- tests/tts_tests/test_vits_d-vectors_train.py | 1 - tests/tts_tests/test_vits_multilingual_train.py | 1 - tests/tts_tests/test_vits_speaker_emb_train.py | 1 - 3 files changed, 3 deletions(-) diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 213669f50b..5fd9cbc1bd 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 50cccca500..577db8a081 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -33,7 +33,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 6cc1dabd68..b9a1102e49 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 90fe858429fc36ad5c8bbd30c2aaee93710fa696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:33:24 +0000 Subject: [PATCH 35/67] =?UTF-8?q?Fix=20synthesis.py=20=F0=9F=94=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/tts/utils/synthesis.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 47ea0e934c..72cd8403a8 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -175,8 +175,6 @@ def synthesis( text, CONFIG, use_cuda, - ap, - tokenizer, speaker_id=None, style_wav=None, use_griffin_lim=False, @@ -232,10 +230,10 @@ def synthesis( if isinstance(style_wav, dict): style_mel = style_wav else: - style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) # convert text to sequence of token IDs text_inputs = np.asarray( - tokenizer.text_to_ids(text), + model.tokenizer.text_to_ids(text), dtype=np.int32, ) # pass tensors to backend From bddcc9d7cc9f26d3073d18a517e71292739ae403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:08 +0000 Subject: [PATCH 36/67] Fixes small compat. 
issues --- TTS/tts/datasets/__init__.py | 4 +-- TTS/tts/datasets/dataset.py | 4 +-- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/models/base_tts.py | 8 +++-- TTS/tts/utils/languages.py | 9 ++++++ TTS/tts/utils/speakers.py | 32 ++++++++++++------- recipes/vctk/fast_pitch/train_fast_pitch.py | 1 - recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- tests/text_tests/test_phonemizer.py | 9 +++--- 10 files changed, 46 insertions(+), 27 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 4e8a2485db..40eed7e365 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -88,8 +88,8 @@ def load_tts_samples( meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for the duration predictor training - if d.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"])) + if dataset.meta_file_attn_mask: + meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins[1]].strip() meta_data_train_all[idx].append(attn_file) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 50fd97d997..5fab71088d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -13,7 +13,7 @@ # to prevent too many open files error as suggested here # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936 -torch.multiprocessing.set_sharing_strategy('file_system') +torch.multiprocessing.set_sharing_strategy("file_system") def _parse_sample(item): @@ -208,7 +208,7 @@ def get_token_ids(self, idx, text): def load_data(self, idx): item = self.samples[idx] - text, wav_file, speaker_name, _, attn_file = _parse_sample(item) + text, wav_file, speaker_name, language_name, attn_file = _parse_sample(item) raw_text = text wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 7e47c44d98..68c07eaa11 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -287,7 +287,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" - file_ext = 'flac' + file_ext = "flac" test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 5986232231..9a6a56df76 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -261,7 +261,7 @@ def get_data_loader( speaker_id_mapping = None d_vector_mapping = None - # setup custom symbols if needed + # setup multi-lingual attributes if hasattr(self, "language_manager"): language_id_mapping = ( self.language_manager.language_id_mapping if self.args.use_language_embedding else None @@ -290,6 +290,7 @@ def get_data_loader( speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, tokenizer=self.tokenizer, + language_id_mapping=language_id_mapping, ) # wait all the DDP process to be ready @@ -303,6 +304,7 @@ def get_data_loader( sampler = DistributedSampler(dataset) if num_gpus > 1 else None # Weighted samplers + # TODO: make this DDP amenable assert not ( num_gpus > 1 and getattr(config, 
"use_language_weighted_sampler", False) ), "language_weighted_sampler is not supported with DistributedSampler" @@ -313,10 +315,10 @@ def get_data_loader( if sampler is None: if getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.items) + sampler = get_language_weighted_sampler(dataset.samples) elif getattr(config, "use_speaker_weighted_sampler", False): print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.items) + sampler = get_speaker_weighted_sampler(dataset.samples) loader = DataLoader( dataset, diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index fc7eec575e..5cecbe6908 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -98,6 +98,15 @@ def save_language_ids_to_file(self, file_path: str) -> None: """ self._save_json(file_path, self.language_id_mapping) + @staticmethod + def init_from_config(config: Coqpit) -> "LanguageManager": + """Initialize the language manager from a Coqpit config. + + Args: + config (Coqpit): Coqpit config. + """ + return LanguageManager(config=config) + def _set_file_path(path): """Find the language_ids.json under the given path or the above it. diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 9d2e6fe30c..7572e888d9 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -9,7 +9,7 @@ from coqpit import Coqpit from torch.utils.data.sampler import WeightedRandomSampler -from TTS.config import load_config +from TTS.config import get_from_config_or_model_args_with_default, load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -331,19 +331,27 @@ def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = SpeakerEncoder: Speaker encoder object. 
""" speaker_manager = None - if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding: + if get_from_config_or_model_args_with_default(config, "use_speaker_embedding", False): if samples: speaker_manager = SpeakerManager(data_items=samples) - if config.get("speaker_file", None): - speaker_manager = SpeakerManager(speaker_id_file_path=config.speaker_file) - if config.get("speakers_file", None): - speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - - if hasattr(config, "use_d_vector_file") and config.use_d_vector_file: - if config.get("speakers_file", None): - speaker_manager = SpeakerManager(d_vectors_file_path=config.speaker_file) - if config.get("d_vector_file", None): - speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + if get_from_config_or_model_args_with_default(config, "speaker_file", None): + speaker_manager = SpeakerManager( + speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) + ) + if get_from_config_or_model_args_with_default(config, "speakers_file", None): + speaker_manager = SpeakerManager( + speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speakers_file", None) + ) + + if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False): + if get_from_config_or_model_args_with_default(config, "speakers_file", None): + speaker_manager = SpeakerManager( + d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) + ) + if get_from_config_or_model_args_with_default(config, "d_vector_file", None): + speaker_manager = SpeakerManager( + d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None) + ) return speaker_manager diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index f7a2ef068a..4d9cc10d1f 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -90,4 +90,3 @@ # AND... 3,2,1... 🚀 trainer.fit() - diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index 853bbb545f..1dcab98285 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -87,4 +87,4 @@ ) # AND... 3,2,1... 🚀 -trainer.fit() \ No newline at end of file +trainer.fit() diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 30050ef535..e35e552db7 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -87,4 +87,4 @@ ) # AND... 3,2,1... 🚀 -trainer.fit() \ No newline at end of file +trainer.fit() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index 512cc195f3..9b619f6ea7 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -109,10 +109,11 @@ def test_is_available(self): class TestGruutPhonemizer(unittest.TestCase): def setUp(self): self.phonemizer = Gruut(language="en-us", use_espeak_phonemes=True, keep_stress=False) - self.EXPECTED_PHONEMES = ["ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", - "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", - "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", - "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" 
+ self.EXPECTED_PHONEMES = [ + "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", + "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", + "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", + "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!", ] def test_phonemize(self): From c35b0c9014db302a0cd53740bf0bc095a299ee13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:29 +0000 Subject: [PATCH 37/67] Update Vits for the new model API --- TTS/tts/models/vits.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 957994f989..83d2f9f92d 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -275,7 +275,12 @@ class Vits(BaseTTS): # pylint: disable=dangerous-default-value def __init__( - self, config: Coqpit, ap: "AudioProcessor", tokenizer: "TTSTokenizer", speaker_manager: SpeakerManager = None, language_manager: LanguageManager = None + self, + config: Coqpit, + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + language_manager: LanguageManager = None, ): super().__init__(config, ap, tokenizer, speaker_manager) @@ -284,8 +289,6 @@ def __init__( self.speaker_manager = speaker_manager self.language_manager = language_manager - self.args = args - self.init_multispeaker(config) self.init_multilingual(config) @@ -306,6 +309,7 @@ def __init__( self.args.num_layers_text_encoder, self.args.kernel_size_text_encoder, self.args.dropout_p_text_encoder, + language_emb_dim=self.embedded_language_dim, ) self.posterior_encoder = PosteriorEncoder( @@ -344,6 +348,7 @@ def __init__( 3, self.args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) self.waveform_decoder = HifiganGenerator( @@ -878,7 +883,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s return self._log(self.ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self) -> Tuple[Dict, Dict]: + def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
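# (Under the refactor, `test_run` now receives the Trainer's `assets` dict in
#  place of the module-level audio processor that recipes used to pass via
#  `training_assets={"audio_processor": ap}`; the next hunk shows the method
#  reading `self.ap` instead.)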
@@ -898,7 +903,7 @@ def test_run(self) -> Tuple[Dict, Dict]: aux_inputs["text"], self.config, "cuda" in str(next(self.parameters()).device), - ap, + self.ap, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], @@ -1001,7 +1006,8 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) - return Vits(new_config, ap, tokenizer, speaker_manager) + language_manager = LanguageManager.init_from_config(config) + return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) class VitsCharacters(BaseCharacters): From b2e1420e6b4d787eeb56e2958b756211c7c04b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:57 +0000 Subject: [PATCH 38/67] Update train_tts for the new API --- TTS/bin/train_tts.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index f053e9d75c..9a4a430a20 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -42,36 +42,8 @@ def main(): # load training samples train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True) - # setup audio processor - ap = AudioProcessor(**config.audio) - - # init speaker manager - if check_config_and_model_args(config, "use_speaker_embedding", True): - speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) - if hasattr(config, "model_args"): - config.model_args.num_speakers = speaker_manager.num_speakers - else: - config.num_speakers = speaker_manager.num_speakers - elif check_config_and_model_args(config, "use_d_vector_file", True): - speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) - if hasattr(config, "model_args"): - config.model_args.num_speakers = speaker_manager.num_speakers - else: - config.num_speakers = speaker_manager.num_speakers - else: - speaker_manager = None - - if hasattr(config, "use_language_embedding") and config.use_language_embedding: - language_manager = LanguageManager(config=config) - if hasattr(config, "model_args"): - config.model_args.num_languages = language_manager.num_languages - else: - config.num_languages = language_manager.num_languages - else: - language_manager = None - # init the model from config - model = setup_model(config, speaker_manager, language_manager) + model = setup_model(config, train_samples + eval_samples) # init the trainer and 🚀 trainer = Trainer( From 83b6cf5876ca13205de72cfdb9c867f453aab704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:35:52 +0000 Subject: [PATCH 39/67] Extend glow_tts model tests --- TTS/tts/models/glow_tts.py | 61 +++++-- tests/tts_tests/test_glow_tts.py | 291 +++++++++++++++++++++++++++---- 2 files changed, 298 insertions(+), 54 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 3dd8d5c836..190c699e01 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -40,11 +40,20 @@ class GlowTTS(BaseTTS): Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments. Examples: + Init only model layers. 
+ >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig >>> from TTS.tts.models.glow_tts import GlowTTS - >>> config = GlowTTSConfig() + >>> config = GlowTTSConfig(num_chars=2) >>> model = GlowTTS(config) + Fully init a model ready for action. All the class attributes and class members + (e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values. + + >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig + >>> from TTS.tts.models.glow_tts import GlowTTS + >>> config = GlowTTSConfig() + >>> model = GlowTTS.init_from_config(config, verbose=False) """ def __init__( @@ -98,25 +107,23 @@ def __init__( def init_multispeaker(self, config: Coqpit): """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding - vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension. + vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets + speaker embedding vector dimension to the d-vector dimension from the config. Args: config (Coqpit): Model configuration. """ self.embedded_speaker_dim = 0 - # init speaker manager - if self.speaker_manager is None and (self.use_speaker_embedding or self.use_d_vector_file): - raise ValueError( - " > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model." - ) # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager if self.speaker_manager is not None: self.num_speakers = self.speaker_manager.num_speakers # set ultimate speaker embedding size - if config.use_speaker_embedding or config.use_d_vector_file: + if config.use_d_vector_file: self.embedded_speaker_dim = ( config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) + if self.speaker_manager is not None: + assert config.d_vector_dim == self.speaker_manager.d_vector_dim, " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: print(" > Init speaker_embedding layer.") @@ -184,12 +191,33 @@ def forward( self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value """ - Shapes: - - x: :math:`[B, T]` - - x_lenghts::math:`B` - - y: :math:`[B, T, C]` - - y_lengths::math:`B` - - g: :math:`[B, C] or B` + Args: + x (torch.Tensor): + Input text sequence ids. :math:`[B, T_en]` + + x_lengths (torch.Tensor): + Lengths of input text sequences. :math:`[B]` + + y (torch.Tensor): + Target mel-spectrogram frames. :math:`[B, T_de, C_mel]` + + y_lengths (torch.Tensor): + Lengths of target mel-spectrogram frames. :math:`[B]` + + aux_input (Dict): + Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model. + :math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model usind speaker-embedding + layer. 
:math:`B` + + Returns: + Dict: + - z: :math: `[B, T_de, C]` + - logdet: :math:`B` + - y_mean: :math:`[B, T_de, C]` + - y_log_scale: :math:`[B, T_de, C]` + - alignments: :math:`[B, T_en, T_de]` + - durations_log: :math:`[B, T_en, 1]` + - total_durations_log: :math:`[B, T_en, 1]` """ # [B, T, C] -> [B, C, T] y = y.transpose(1, 2) @@ -508,17 +536,18 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): """Initiate model from config Args: config (VitsConfig): Model config. samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. Defaults to None. + verbose (bool): If True, print init messages. Defaults to True. """ from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config) + ap = AudioProcessor.init_from_config(config, verbose) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) return GlowTTS(new_config, ap, tokenizer, speaker_manager) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 82d0ec3b78..e97b793a67 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -1,11 +1,13 @@ import copy import os import unittest +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch from torch import optim -from tests import get_tests_input_path +from tests import get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models.glow_tts import GlowTTS @@ -28,36 +30,211 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -class GlowTTSTrainTest(unittest.TestCase): - @staticmethod - def test_train_step(): +class TestGlowTTS(unittest.TestCase): + def _create_inputs(self): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids - criterion = GlowTTSLoss() + def _check_parameter_changes(self, model, model_ref): + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref + ) + count += 1 - # model to train + def test_init_multispeaker(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config) + # speaker embedding with default speaker_embedding_dim + config.use_speaker_embedding = True + config.num_speakers = 5 + config.d_vector_dim = None + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, model.hidden_channels_enc) + # use external speaker embeddings with speaker_embedding_dim = 301 + config = GlowTTSConfig(num_chars=32) + config.use_d_vector_file = True + config.d_vector_dim = 301 + model = GlowTTS(config) + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, 301) + # use speaker embedddings by the provided speaker_manager + config = GlowTTSConfig(num_chars=32) + config.use_speaker_embedding = True + config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech", "speakers.json") + speaker_manager = SpeakerManager.init_from_config(config) + model = GlowTTS(config) + model.speaker_manager = speaker_manager + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, model.hidden_channels_enc) + self.assertEqual(model.num_speakers, speaker_manager.num_speakers) + # use external speaker embeddings by the provided speaker_manager + config = GlowTTSConfig(num_chars=32) + config.use_d_vector_file = True + config.d_vector_dim = 256 + config.d_vector_file = os.path.join(get_tests_data_path(), "dummy_speakers.json") + speaker_manager = SpeakerManager.init_from_config(config) + model = GlowTTS(config) + model.speaker_manager = speaker_manager + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, speaker_manager.d_vector_dim) + self.assertEqual(model.num_speakers, speaker_manager.num_speakers) + + def test_unlock_act_norm_layers(self): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) + model.unlock_act_norm_layers() + for f in model.decoder.flows: + if getattr(f, "set_ddi", False): + self.assertFalse(f.initialized) - # reference model to compare model weights - model_ref = GlowTTS(config).to(device) + def test_lock_act_norm_layers(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.lock_act_norm_layers() + for f in model.decoder.flows: + if getattr(f, "set_ddi", False): + self.assertTrue(f.initialized) + + def test_forward(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + # create model + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + + def test_forward_with_d_vector(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + d_vector = torch.rand(8, 256).to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + 
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + def test_forward_with_speaker_id(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_speaker_embedding=True, + num_speakers=24, + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + + def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): + output_shape = outputs["model_outputs"].shape + self.assertEqual(outputs["model_outputs"].shape[::2] , mel_spec.shape[::2]) + self.assertEqual(outputs["logdet"], None) + self.assertEqual(outputs["y_mean"].shape, output_shape) + self.assertEqual(outputs["y_log_scale"].shape, output_shape) + self.assertEqual(outputs["alignments"].shape, output_shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(outputs["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(outputs["total_durations_log"].shape, input_dummy.shape + (1,)) + + def test_inference(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.eval() + outputs = model.inference(input_dummy, {"x_lengths": input_lengths}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_d_vector(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + d_vector = torch.rand(8, 256).to(device) + config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.eval() + outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_speaker_ids(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = 
self._create_inputs() + speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_speaker_embedding=True, + num_speakers=24, + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_MAS(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + # create model + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.eval() + # inference encoder and decoder with MAS + y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) + y2 = model.decoder_inference(mel_spec, mel_lengths) + assert ( + y2["model_outputs"].shape == y["model_outputs"].shape + ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) + def test_train_step(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + criterion = GlowTTSLoss() + # model to train + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + # reference model to compare model weights + model_ref = GlowTTS(config).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) - count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) for _ in range(5): optimizer.zero_grad() @@ -75,40 +252,78 @@ def test_train_step(): loss = loss_dict["loss"] loss.backward() optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 + self._check_parameter_changes(model, model_ref) + + def test_train_eval_log(self): + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs() + batch = {} + batch["text_input"] = input_dummy + batch["text_lengths"] = input_lengths + batch["mel_lengths"] = mel_lengths + batch["mel_input"] = mel_spec + batch["d_vectors"] = None + batch["speaker_ids"] = None + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.train() + logger = TensorboardLogger(log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name = "glow_tts_test_train_log") + criterion = model.get_criterion() + outputs, _ = model.train_step(batch, criterion) + model.train_log(batch, outputs, logger, None, 1) + model.eval_log(batch, outputs, logger, None, 1) + logger.finish() + def test_test_run(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.eval() + test_figures, test_audios = model.test_run(None) + self.assertTrue(test_figures is not None) + self.assertTrue(test_audios is not None) -class GlowTTSInferenceTest(unittest.TestCase): - @staticmethod - def test_inference(): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) - input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_lengths = torch.randint(20, 30, (8,)).long().to(device) - speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + def test_load_checkpoint(self): + chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + chkp = {} + chkp["model"] = model.state_dict() + torch.save(chkp, chkp_path) + model.load_checkpoint(config, chkp_path) + self.assertTrue(model.training) + model.load_checkpoint(config, chkp_path, eval=True) + self.assertFalse(model.training) - # create model + def test_get_criterion(self): config = GlowTTSConfig(num_chars=32) - model = GlowTTS(config).to(device) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + criterion = model.get_criterion() + self.assertTrue(criterion is not None) - model.eval() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + def test_init_from_config(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) - # inference encoder and decoder with MAS - y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) + config = GlowTTSConfig(num_chars=32, num_speakers=2) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 2) + self.assertTrue(not hasattr(model, "emb_g")) - y2 = model.decoder_inference(mel_spec, mel_lengths) + config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 2) + self.assertTrue(hasattr(model, "emb_g")) + + config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True, speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + 
self.assertTrue(model.num_speakers == 10) + self.assertTrue(hasattr(model, "emb_g")) + + config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 1) + self.assertTrue(not hasattr(model, "emb_g")) + self.assertTrue(model.c_in_channels == config.d_vector_dim) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape - ) From 5a1d2dedca72248d0b5d75d97172ab87a22b6f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:09 +0000 Subject: [PATCH 40/67] Add verbose option to AudioProcessor --- TTS/utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 55ce49b508..e92acf574e 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -380,10 +380,10 @@ def __init__( self.symmetric_norm = None @staticmethod - def init_from_config(config: "Coqpit"): + def init_from_config(config: "Coqpit", verbose=True): if "audio" in config: - return AudioProcessor(**config.audio) - return AudioProcessor(**config) + return AudioProcessor(verbose=verbose, **config.audio) + return AudioProcessor(verbose=verbose, **config) ### setting up the parameters ### def _build_mel_basis( From 3c9e5188a23d95b4ebdcb6001b9f5e836ada0aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:25 +0000 Subject: [PATCH 41/67] Fix tokenizer init_from_config --- TTS/tts/utils/text/tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 3f416bbb86..f84a51eed8 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -146,8 +146,9 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): the config values. Defaults to None. 
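+
+        Example (an illustrative sketch; assumes a config that carries the default character-set
+        and cleaner fields):
+
+            >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
+            >>> tokenizer, new_config = TTSTokenizer.init_from_config(GlowTTSConfig())
+            >>> token_ids = tokenizer.text_to_ids("Hello world!")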
""" # init cleaners + text_cleaner = None if isinstance(config.text_cleaner, (str, list)): - text_cleaner = getattr(cleaners, config.text_cleaner) + text_cleaner = getattr(config, "text_cleaner") # init characters if characters is None: From 9d9a5b33a082badb7f9a1e788031322332efa339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:46 +0000 Subject: [PATCH 42/67] Fix glow_tts_config missing field --- TTS/tts/configs/glow_tts_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index ce8eee6dfa..f42f3e5a51 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -153,6 +153,7 @@ class GlowTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False + speakers_file: str = None use_d_vector_file: bool = False d_vector_file: str = False From 79a5400e0ab894319bc0550e8ef900cd81f04544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:37:02 +0000 Subject: [PATCH 43/67] Add get_tests_data_path --- tests/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/__init__.py b/tests/__init__.py index 0a0c3379c3..8906c8c796 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -26,6 +26,11 @@ def get_tests_input_path(): return os.path.join(get_tests_path(), "inputs") +def get_tests_data_path(): + """Returns the path to the test data directory.""" + return os.path.join(get_tests_path(), "data") + + def get_tests_output_path(): """Returns the path to the directory for test outputs.""" return os.path.join(get_tests_path(), "outputs") From 09195786ec67a309ee2c58c007cbe937310945b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 14:30:53 +0000 Subject: [PATCH 44/67] Make lint --- TTS/bin/train_tts.py | 5 +---- TTS/tts/datasets/formatters.py | 3 +-- TTS/tts/models/glow_tts.py | 4 +++- TTS/tts/models/vits.py | 14 ++++++++---- TTS/tts/utils/synthesis.py | 4 ---- TTS/tts/utils/text/tokenizer.py | 2 +- TTS/utils/synthesizer.py | 9 ++------ tests/tts_tests/test_glow_tts.py | 38 +++++++++++++++++++++++--------- 8 files changed, 46 insertions(+), 33 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 9a4a430a20..6477e75b99 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,12 +1,9 @@ import os -from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config +from TTS.config import load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.languages import LanguageManager -from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.audio import AudioProcessor def main(): diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 68c07eaa11..8000e783b2 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,10 +285,9 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): +def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2", ignored_speakers=None): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" file_ext = "flac" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", 
recursive=True)
     for meta_file in meta_files:
diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py
index 190c699e01..da7fca17c3 100644
--- a/TTS/tts/models/glow_tts.py
+++ b/TTS/tts/models/glow_tts.py
@@ -123,7 +123,9 @@ def init_multispeaker(self, config: Coqpit):
                 config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
             )
             if self.speaker_manager is not None:
-                assert config.d_vector_dim == self.speaker_manager.d_vector_dim, " [!] d-vector dimension mismatch b/w config and speaker manager."
+                assert (
+                    config.d_vector_dim == self.speaker_manager.d_vector_dim
+                ), " [!] d-vector dimension mismatch b/w config and speaker manager."
         # init speaker embedding layer
         if config.use_speaker_embedding and not config.use_d_vector_file:
             print(" > Init speaker_embedding layer.")
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 83d2f9f92d..df8abd8e16 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,5 +1,4 @@
 import math
-import random
 from dataclasses import dataclass, field, replace
 from itertools import chain
 from typing import Dict, List, Tuple, Union
@@ -266,10 +265,20 @@ class Vits(BaseTTS):
     Check :class:`TTS.tts.configs.vits_config.VitsConfig` for class arguments.

     Examples:
+        Init only the model layers.
+
         >>> from TTS.tts.configs.vits_config import VitsConfig
         >>> from TTS.tts.models.vits import Vits
         >>> config = VitsConfig()
         >>> model = Vits(config)
+
+        Fully initialize a model ready for use. All the class attributes and class members
+        (e.g. Tokenizer, AudioProcessor, etc.) are initialized internally based on config values.
+
+        >>> from TTS.tts.configs.vits_config import VitsConfig
+        >>> from TTS.tts.models.vits import Vits
+        >>> config = VitsConfig()
+        >>> model = Vits.init_from_config(config)
     """

     # pylint: disable=dangerous-default-value
@@ -903,13 +912,10 @@ def test_run(self, assets) -> Tuple[Dict, Dict]:
                 aux_inputs["text"],
                 self.config,
                 "cuda" in str(next(self.parameters()).device),
-                self.ap,
                 speaker_id=aux_inputs["speaker_id"],
                 d_vector=aux_inputs["d_vector"],
                 style_wav=aux_inputs["style_wav"],
                 language_id=aux_inputs["language_id"],
-                language_name=aux_inputs["language_name"],
-                enable_eos_bos_chars=self.config.enable_eos_bos_chars,
                 use_griffin_lim=True,
                 do_trim_silence=False,
             ).values()
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 72cd8403a8..6fed838205 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -181,7 +181,6 @@ def synthesis(
     do_trim_silence=False,
     d_vector=None,
     language_id=None,
-    language_name=None,
     backend="torch",
 ):
     """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
@@ -218,9 +217,6 @@ def synthesis(
         language_id (int):
            Language ID passed to the language embedding layer in a multi-lingual model. Defaults to None.

-        language_name (str):
-            Language name corresponding to the language code used by the phonemizer. Defaults to None.
-
         backend (str):
             tf or torch. Defaults to "torch".
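+
+        Example (a minimal sketch; assumes `model` is an already-initialized TTS model such as
+        one returned by `GlowTTS.init_from_config`):
+
+            >>> outputs = synthesis(model, text="Hello world!", CONFIG=model.config, use_cuda=False, use_griffin_lim=True)
+            >>> wav = outputs["wav"]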
""" diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f84a51eed8..80be368d48 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -148,7 +148,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): # init cleaners text_cleaner = None if isinstance(config.text_cleaner, (str, list)): - text_cleaner = getattr(config, "text_cleaner") + text_cleaner = getattr(cleaners, config.text_cleaner) # init characters if characters is None: diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index f6a1ae6ab1..a1a323e819 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,13 +122,9 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) - speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: - self.tts_model = setup_tts_model( - config=self.tts_config, - speaker_manager=speaker_manager, - language_manager=language_manager, - ) + self.tts_model = setup_tts_model(config=self.tts_config) else: - self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) + self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -333,7 +329,6 @@ def tts( use_cuda=self.use_cuda, speaker_id=speaker_id, language_id=language_id, - language_name=language_name, style_wav=style_wav, use_griffin_lim=use_gl, d_vector=speaker_embedding, diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index e97b793a67..e48977e9d4 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -1,8 +1,6 @@ import copy import os import unittest -from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch from torch import optim @@ -11,7 +9,9 @@ from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models.glow_tts import GlowTTS +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor +from TTS.utils.logging.tensorboard_logger import TensorboardLogger # pylint: disable=unused-variable @@ -31,7 +31,8 @@ def count_parameters(model): class TestGlowTTS(unittest.TestCase): - def _create_inputs(self): + @staticmethod + def _create_inputs(): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 @@ -40,7 +41,8 @@ def _create_inputs(self): speaker_ids = torch.randint(0, 5, (8,)).long().to(device) return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids - def _check_parameter_changes(self, model, model_ref): + @staticmethod + def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( @@ -166,7 +168,7 @@ def test_forward_with_speaker_id(self): def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): output_shape = outputs["model_outputs"].shape - self.assertEqual(outputs["model_outputs"].shape[::2] , mel_spec.shape[::2]) + self.assertEqual(outputs["model_outputs"].shape[::2], mel_spec.shape[::2]) self.assertEqual(outputs["logdet"], None) self.assertEqual(outputs["y_mean"].shape, output_shape) self.assertEqual(outputs["y_log_scale"].shape, output_shape) @@ -185,7 +187,12 @@ def test_inference(self): def test_inference_with_d_vector(self): input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() d_vector = torch.rand(8, 256).to(device) - config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) model.eval() outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) @@ -268,7 +275,9 @@ def test_train_eval_log(self): model = GlowTTS.init_from_config(config, verbose=False).to(device) model.run_data_dep_init = False model.train() - logger = TensorboardLogger(log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name = "glow_tts_test_train_log") + logger = TensorboardLogger( + log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name="glow_tts_test_train_log" + ) criterion = model.get_criterion() outputs, _ = model.train_step(batch, criterion) model.train_log(batch, outputs, logger, None, 1) @@ -316,14 +325,23 @@ def test_init_from_config(self): self.assertTrue(model.num_speakers == 2) self.assertTrue(hasattr(model, "emb_g")) - config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True, speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")) + config = GlowTTSConfig( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 10) self.assertTrue(hasattr(model, "emb_g")) - config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) self.assertTrue(model.c_in_channels == config.d_vector_dim) - From 26be609cfaaa83250100b62001c9efe9d05ccb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Jan 2022 17:39:06 +0000 Subject: [PATCH 45/67] Extend unittests --- TTS/tts/layers/vits/networks.py | 9 +- TTS/tts/models/vits.py | 46 ++++- tests/tts_tests/test_glow_tts.py | 89 ++++++---- tests/tts_tests/test_vits.py | 285 +++++++++++++++++++++++++++---- 4 files changed, 361 insertions(+), 68 deletions(-) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index ef426ace5c..b6497c78fa 100644 --- a/TTS/tts/layers/vits/networks.py +++ 
b/TTS/tts/layers/vits/networks.py @@ -83,6 +83,7 @@ def forward(self, x, x_lengths, lang_emb=None): - x: :math:`[B, T]` - x_length: :math:`[B]` """ + assert x.shape[0] == x_lengths.shape[0] x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] # concat the lang emb in embedding chars @@ -90,7 +91,7 @@ def forward(self, x, x_lengths, lang_emb=None): x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1) x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) # [b, 1, t] x = self.encoder(x * x_mask, x_mask) stats = self.proj(x) * x_mask @@ -136,6 +137,9 @@ def __init__( def forward(self, x, x_mask, g=None, reverse=False): """ + Note: + Set `reverse` to True for inference. + Shapes: - x: :math:`[B, C, T]` - x_mask: :math:`[B, 1, T]` @@ -209,6 +213,9 @@ def __init__( def forward(self, x, x_mask, g=None, reverse=False): """ + Note: + Set `reverse` to True for inference. + Shapes: - x: :math:`[B, C, T]` - x_mask: :math:`[B, 1, T]` diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index df8abd8e16..34cb69c8a7 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -568,6 +568,19 @@ def forward( - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` - language_ids: :math:`[B]` + + Return Shapes: + - model_outputs: :math:`[B, 1, T_wav]` + - alignments: :math:`[B, T_seq, T_dec]` + - z: :math:`[B, C, T_dec]` + - z_p: :math:`[B, C, T_dec]` + - m_p: :math:`[B, C, T_dec]` + - logs_p: :math:`[B, C, T_dec]` + - m_q: :math:`[B, C, T_dec]` + - logs_q: :math:`[B, C, T_dec]` + - waveform_seg: :math:`[B, 1, spec_seg_size * hop_length]` + - gt_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` + - syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) @@ -668,15 +681,33 @@ def forward( ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): + @staticmethod + def _set_x_lengths(x, aux_input): + if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: + return aux_input["x_lengths"] + return torch.tensor(x.shape[1:2]).to(x.device) + + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None}): """ + Note: + To run in batch mode, provide `x_lengths` else model assumes that the batch size is 1. 
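+
+        Example (an illustrative sketch; `model` is an initialized `Vits` and the token ids are made up):
+
+            >>> import torch
+            >>> x = torch.randint(0, 24, (2, 50))   # two padded token-id sequences
+            >>> x_lengths = torch.tensor([50, 30])  # true length of each sequence
+            >>> outputs = model.inference(x, aux_input={"x_lengths": x_lengths})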
+ Shapes: - x: :math:`[B, T_seq]` - - d_vectors: :math:`[B, C, 1]` + - x_lengths: :math:`[B]` + - d_vectors: :math:`[B, C]` - speaker_ids: :math:`[B]` + + Return Shapes: + - model_outputs: :math:`[B, 1, T_wav]` + - alignments: :math:`[B, T_seq, T_dec]` + - z: :math:`[B, C, T_dec]` + - z_p: :math:`[B, C, T_dec]` + - m_p: :math:`[B, C, T_dec]` + - logs_p: :math:`[B, C, T_dec]` """ sid, g, lid = self._set_cond_input(aux_input) - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = self._set_x_lengths(x, aux_input) # speaker embedding if self.args.use_speaker_embedding and sid is not None: @@ -699,8 +730,9 @@ def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "langu w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype) - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype).unsqueeze(1) # [B, 1, T_dec] + + attn_mask = x_mask * y_mask.transpose(1, 2) # [B, 1, T_enc] * [B, T_dec, 1] attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1).transpose(1, 2)) m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2) @@ -999,7 +1031,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): """Initiate model from config Args: @@ -1009,7 +1041,7 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] """ from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config) + ap = AudioProcessor.init_from_config(config, verbose=verbose) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) language_manager = LanguageManager.init_from_config(config) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index e48977e9d4..305f86b896 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -23,6 +23,7 @@ ap = AudioProcessor(**c.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") +BATCH_SIZE = 3 def count_parameters(model): @@ -32,13 +33,13 @@ def count_parameters(model): class TestGlowTTS(unittest.TestCase): @staticmethod - def _create_inputs(): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) + def _create_inputs(batch_size=8): + input_dummy = torch.randint(0, 24, (batch_size, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (batch_size,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_lengths = torch.randint(20, 30, (8,)).long().to(device) - speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + mel_spec = torch.rand(batch_size, 30, c.audio["num_mels"]).to(device) + mel_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) + speaker_ids = torch.randint(0, 5, (batch_size,)).long().to(device) return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids @staticmethod @@ -104,8 +105,8 @@ def test_lock_act_norm_layers(self): if getattr(f, "set_ddi", False): self.assertTrue(f.initialized) - def test_forward(self): - input_dummy, input_lengths, mel_spec, 
mel_lengths, speaker_ids = self._create_inputs() + def _test_forward(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) # create model config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) @@ -114,16 +115,20 @@ def test_forward(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_forward_with_d_vector(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - d_vector = torch.rand(8, 256).to(device) + def test_forward(self): + self._test_forward(1) + self._test_forward(3) + + def _test_forward_with_d_vector(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + d_vector = torch.rand(batch_size, 256).to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -137,16 +142,20 @@ def test_forward_with_d_vector(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_forward_with_speaker_id(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + def test_forward_with_d_vector(self): + self._test_forward_with_d_vector(1) + self._test_forward_with_d_vector(3) + + def _test_forward_with_speaker_id(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + speaker_ids = torch.randint(0, 24, (batch_size,)).long().to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -159,13 +168,17 @@ def test_forward_with_speaker_id(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + def test_forward_with_speaker_id(self): + self._test_forward_with_speaker_id(1) + 
self._test_forward_with_speaker_id(3) + def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): output_shape = outputs["model_outputs"].shape self.assertEqual(outputs["model_outputs"].shape[::2], mel_spec.shape[::2]) @@ -176,17 +189,21 @@ def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): self.assertEqual(outputs["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(outputs["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_inference(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + def _test_inference(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.eval() outputs = model.inference(input_dummy, {"x_lengths": input_lengths}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_d_vector(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - d_vector = torch.rand(8, 256).to(device) + def test_inference(self): + self._test_inference(1) + self._test_inference(3) + + def _test_inference_with_d_vector(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + d_vector = torch.rand(batch_size, 256).to(device) config = GlowTTSConfig( num_chars=32, use_d_vector_file=True, @@ -198,9 +215,13 @@ def test_inference_with_d_vector(self): outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_speaker_ids(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + def test_inference_with_d_vector(self): + self._test_inference_with_d_vector(1) + self._test_inference_with_d_vector(3) + + def _test_inference_with_speaker_ids(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + speaker_ids = torch.randint(0, 24, (batch_size,)).long().to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -211,8 +232,12 @@ def test_inference_with_speaker_ids(self): outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_MAS(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + def test_inference_with_speaker_ids(self): + self._test_inference_with_speaker_ids(1) + self._test_inference_with_speaker_ids(3) + + def _test_inference_with_MAS(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) # create model config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) @@ -226,8 +251,13 @@ def test_inference_with_MAS(self): y["model_outputs"].shape, y2["model_outputs"].shape ) + def test_inference_with_MAS(self): + self._test_inference_with_MAS(1) + self._test_inference_with_MAS(3) + def test_train_step(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + batch_size = BATCH_SIZE + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) criterion = GlowTTSLoss() # model to train config = GlowTTSConfig(num_chars=32) @@ 
-263,7 +293,8 @@ def test_train_step(self): self._check_parameter_changes(model, model_ref) def test_train_eval_log(self): - input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs() + batch_size = BATCH_SIZE + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(batch_size) batch = {} batch["text_input"] = input_dummy batch["text_lengths"] = input_lengths diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 4274d9479b..53e7c09e98 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -1,9 +1,11 @@ +import copy import os import unittest +from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch -from tests import assertHasAttr, assertHasNotAttr, get_tests_input_path +from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.tts.configs.vits_config import VitsConfig @@ -100,35 +102,35 @@ def test_voice_conversion(self): self.assertEqual(z_p.shape, (1, args.hidden_channels, spec_len)) self.assertEqual(z_hat.shape, (1, args.hidden_channels, spec_len)) - def _init_inputs(self, config): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) + def _create_inputs(self, config, batch_size=2): + input_dummy = torch.randint(0, 24, (batch_size, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (batch_size,)).long().to(device) input_lengths[-1] = 128 - spec = torch.rand(8, config.audio["fft_size"] // 2 + 1, 30).to(device) - spec_lengths = torch.randint(20, 30, (8,)).long().to(device) + spec = torch.rand(batch_size, config.audio["fft_size"] // 2 + 1, 30).to(device) + spec_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) spec_lengths[-1] = spec.size(2) - waveform = torch.rand(8, 1, spec.size(2) * config.audio["hop_length"]).to(device) + waveform = torch.rand(batch_size, 1, spec.size(2) * config.audio["hop_length"]).to(device) return input_dummy, input_lengths, spec, spec_lengths, waveform - def _check_forward_outputs(self, config, output_dict, encoder_config=None): + def _check_forward_outputs(self, config, output_dict, encoder_config=None, batch_size=2): self.assertEqual( output_dict["model_outputs"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] ) - self.assertEqual(output_dict["alignments"].shape, (8, 128, 30)) + self.assertEqual(output_dict["alignments"].shape, (batch_size, 128, 30)) self.assertEqual(output_dict["alignments"].max(), 1) self.assertEqual(output_dict["alignments"].min(), 0) - self.assertEqual(output_dict["z"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["z_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["m_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["logs_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["m_q"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["logs_q"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["z"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["z_p"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_p"].shape, (batch_size, 
config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_p"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_q"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_q"].shape, (batch_size, config.model_args.hidden_channels, 30)) self.assertEqual( output_dict["waveform_seg"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] ) if encoder_config: - self.assertEqual(output_dict["gt_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) - self.assertEqual(output_dict["syn_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["gt_spk_emb"].shape, (batch_size, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["syn_spk_emb"].shape, (batch_size, encoder_config.model_params["proj_dim"])) else: self.assertEqual(output_dict["gt_spk_emb"], None) self.assertEqual(output_dict["syn_spk_emb"], None) @@ -137,7 +139,7 @@ def test_forward(self): num_speakers = 0 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) config.model_args.spec_segment_size = 10 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config) model = Vits(config).to(device) output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform) self._check_forward_outputs(config, output_dict) @@ -148,7 +150,7 @@ def test_multispeaker_forward(self): config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) config.model_args.spec_segment_size = 10 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config) speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) model = Vits(config).to(device) @@ -157,16 +159,36 @@ def test_multispeaker_forward(self): ) self._check_forward_outputs(config, output_dict) + def test_d_vector_forward(self): + batch_size = 2 + args = VitsArgs( + spec_segment_size=10, + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + config = VitsConfig(model_args=args) + model = Vits.init_from_config(config, verbose=False).to(device) + model.train() + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + d_vectors = torch.randn(batch_size, 256).to(device) + output_dict = model.forward( + input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"d_vectors": d_vectors} + ) + self._check_forward_outputs(config, output_dict) + def test_multilingual_forward(self): num_speakers = 10 num_langs = 3 + batch_size = 2 args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) - speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) - lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) 
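+        # NOTE: both conditioning signals are plain LongTensors of shape [B]; the model looks
+        # speaker_ids and lang_ids up in its internal embedding tables during the forward pass.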
model = Vits(config).to(device) output_dict = model.forward( @@ -182,6 +204,7 @@ def test_multilingual_forward(self): def test_secl_forward(self): num_speakers = 10 num_langs = 3 + batch_size = 2 speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) speaker_encoder_config.model_params["use_torch_spec"] = True @@ -198,9 +221,9 @@ def test_secl_forward(self): config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) config.audio.sample_rate = 16000 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) - speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) - lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) model = Vits(config, speaker_manager=speaker_manager).to(device) output_dict = model.forward( @@ -213,28 +236,228 @@ def test_secl_forward(self): ) self._check_forward_outputs(config, output_dict, speaker_encoder_config) + def _check_inference_outputs(self, config, outputs, input_dummy, batch_size=1): + feat_len = outputs["z"].shape[2] + self.assertEqual(outputs["model_outputs"].shape[:2], (batch_size, 1)) # we don't know the channel dimension + self.assertEqual(outputs["alignments"].shape, (batch_size, input_dummy.shape[1], feat_len)) + self.assertEqual(outputs["z"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["z_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["m_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["logs_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + def test_inference(self): num_speakers = 0 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) - input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) model = Vits(config).to(device) - _ = model.inference(input_dummy) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + outputs = model.inference(input_dummy) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + outputs = model.inference(input_dummy, aux_input={"x_lengths": input_lengths}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) def test_multispeaker_inference(self): num_speakers = 10 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) - input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) - speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) model = Vits(config).to(device) - _ = model.inference(input_dummy, {"speaker_ids": speaker_ids}) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + outputs = model.inference(input_dummy, {"speaker_ids": speaker_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + outputs = 
model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) def test_multilingual_inference(self): num_speakers = 10 num_langs = 3 args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) + model = Vits(config).to(device) + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) lang_ids = torch.randint(0, num_langs, (1,)).long().to(device) - model = Vits(config).to(device) _ = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) + outputs = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) + outputs = model.inference( + input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids, "language_ids": lang_ids} + ) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + def test_d_vector_inference(self): + args = VitsArgs( + spec_segment_size=10, + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + config = VitsConfig(model_args=args) + model = Vits.init_from_config(config, verbose=False).to(device) + model.eval() + # batch size = 1 + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) + d_vectors = torch.randn(1, 256).to(device) + outputs = model.inference(input_dummy, aux_input={"d_vectors": d_vectors}) + self._check_inference_outputs(config, outputs, input_dummy) + # batch size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config) + d_vectors = torch.randn(2, 256).to(device) + outputs = model.inference(input_dummy, aux_input={"x_lengths": input_lengths, "d_vectors": d_vectors}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=2) + + @staticmethod + def _check_parameter_changes(model, model_ref): + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref + ) + count += 1 + + def _create_batch(self, config, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(config, batch_size) + batch = {} + batch["text_input"] = input_dummy + batch["text_lengths"] = input_lengths + batch["mel_lengths"] = mel_lengths + batch["linear_input"] = mel_spec.transpose(1, 2) + batch["waveform"] = torch.rand(batch_size, config.audio["sample_rate"] * 10, 1).to(device) + batch["d_vectors"] = None + batch["speaker_ids"] = None + batch["language_ids"] = None + return batch + + def test_train_step(self): + # setup the model + config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) + model = Vits(config).to(device) + # create a batch + batch = self._create_batch(config, 1) + # model to train + criterions = model.get_criterion() + criterions = [criterions[0].to(device), criterions[1].to(device)] + # reference model to compare model weights + model_ref = Vits(config).to(device) + model.train() + # pass the state to ref model + model_ref.load_state_dict(copy.deepcopy(model.state_dict())) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizers = model.get_optimizer() + for _ in range(5): + _, loss_dict = model.train_step(batch, criterions, 0) + loss = loss_dict["loss"] + loss.backward() + optimizers[0].step() + + _, loss_dict = model.train_step(batch, criterions, 1) + loss = loss_dict["loss"] + loss.backward() + optimizers[1].step() + # check parameter changes + self._check_parameter_changes(model, model_ref) + + def test_train_eval_log(self): + batch_size = 2 + config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) + model = Vits.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.train() + batch = self._create_batch(config, batch_size) + logger = TensorboardLogger( + log_dir=os.path.join(get_tests_output_path(), "dummy_vits_logs"), model_name="vits_test_train_log" + ) + criterion = model.get_criterion() + criterion = [criterion[0].to(device), criterion[1].to(device)] + outputs = [None] * 2 + outputs[0], _ = model.train_step(batch, criterion, 0) + outputs[1], _ = model.train_step(batch, criterion, 1) + model.train_log(batch, outputs, logger, None, 1) + + model.eval_log(batch, outputs, logger, None, 1) + logger.finish() + + def test_test_run(self): + config = VitsConfig(model_args=VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.eval() + test_figures, test_audios = model.test_run(None) + self.assertTrue(test_figures is not None) + self.assertTrue(test_audios is not None) + + def test_load_checkpoint(self): + chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") + config = VitsConfig(VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + chkp = {} + chkp["model"] = model.state_dict() + torch.save(chkp, chkp_path) + model.load_checkpoint(config, chkp_path) + self.assertTrue(model.training) + model.load_checkpoint(config, chkp_path, eval=True) + self.assertFalse(model.training) + + def test_get_criterion(self): + config = VitsConfig(VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + criterion = model.get_criterion() + self.assertTrue(criterion is not None) + + def test_init_from_config(self): + config = 
VitsConfig(model_args=VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + + config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2)) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertTrue(not hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2, use_speaker_embedding=True)) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertEqual(model.num_speakers, 2) + self.assertTrue(hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + )) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertEqual(model.num_speakers, 10) + self.assertTrue(hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + )) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 1) + self.assertTrue(not hasattr(model, "emb_g")) + self.assertTrue(model.embedded_speaker_dim == config.d_vector_dim) From 4b612d71a392212bd31ac428b716873060c8950f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Jan 2022 17:43:05 +0000 Subject: [PATCH 46/67] Make lint --- tests/tts_tests/test_vits.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 53e7c09e98..eaa325b002 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -1,7 +1,6 @@ import copy import os import unittest -from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch @@ -11,6 +10,7 @@ from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.logging.tensorboard_logger import TensorboardLogger LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") @@ -337,7 +337,7 @@ def _check_parameter_changes(model, model_ref): count += 1 def _create_batch(self, config, batch_size): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(config, batch_size) + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(config, batch_size) batch = {} batch["text_input"] = input_dummy batch["text_lengths"] = input_lengths @@ -441,22 +441,26 @@ def test_init_from_config(self): self.assertEqual(model.num_speakers, 2) self.assertTrue(hasattr(model, "emb_g")) - config = VitsConfig(model_args=VitsArgs( - num_chars=32, - num_speakers=2, - use_speaker_embedding=True, - speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), - )) + config = VitsConfig( + model_args=VitsArgs( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + ) + ) model = Vits.init_from_config(config, verbose=False).to(device) self.assertEqual(model.num_speakers, 10) self.assertTrue(hasattr(model, "emb_g")) - config = VitsConfig(model_args=VitsArgs( - num_chars=32, - use_d_vector_file=True, - d_vector_dim=256, - 
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), - )) + config = VitsConfig( + model_args=VitsArgs( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + ) model = Vits.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) From 24336262f5093a76e0b2a2f0e1794bfcd40e5d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 14 Jan 2022 12:10:39 +0000 Subject: [PATCH 47/67] Fix tests --- TTS/bin/find_unique_phonemes.py | 8 +++++--- tests/aux_tests/test_find_unique_phonemes.py | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d3143ca324..e84c17de2f 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,14 +7,16 @@ from TTS.config import load_config from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone +from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut + + +phonemizer = Gruut(language="en-us") def compute_phonemes(item): try: text = item[0] - language = item[-1] - ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|") + ph = phonemizer.phonemize(text).split("|") except: return [] return list(set(ph)) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index fa0abe4b92..fa740ba361 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -39,7 +39,6 @@ def test_espeak_phonemes(): num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, @@ -64,7 +63,6 @@ def test_no_espeak_phonemes(): num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 911b2dbab95b5af640d6ebdf2131b8df6bc12e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 14 Jan 2022 12:10:54 +0000 Subject: [PATCH 48/67] Fix docstring --- TTS/tts/datasets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 40eed7e365..07f3d99ce8 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -13,7 +13,7 @@ def split_dataset(items): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. + items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`. """ speakers = [item[-1] for item in items] is_multi_speaker = len(set(speakers)) > 1 @@ -52,7 +52,7 @@ def load_tts_samples( formatter (Callable, optional): The preprocessing function to be applied to create the list of samples. It must take the root_path and the meta_file name and return a list of samples in the format of - `[[audio_path, text, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as + `[[text, audio_path, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as example. Defaults to None. 
Returns: From 2472d43124f44ac6bed14072e673ba1f5d1921c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 Jan 2022 15:27:41 +0000 Subject: [PATCH 49/67] Allow padding for shorter segments --- TTS/tts/utils/helpers.py | 37 ++++++++++++++++++++++++++------- tests/tts_tests/test_helpers.py | 30 +++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index b0a010b0b1..3251337768 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -57,40 +57,61 @@ def sequence_mask(sequence_length, max_len=None): return mask -def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4): +def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_short=False): """Segment each sample in a batch based on the provided segment indices Args: x (torch.tensor): Input tensor. segment_indices (torch.tensor): Segment indices. segment_size (int): Expected output segment size. + pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size. """ + # pad the input tensor if it is shorter than the segment size + if pad_short and x.shape[-1] < segment_size: + x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) + segments = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): index_start = segment_indices[i] index_end = index_start + segment_size - segments[i] = x[i, :, index_start:index_end] + x_i = x[i] + if pad_short and index_end > x.size(2): + # pad the sample if it is shorter than the segment size + x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) + segments[i] = x_i[:, index_start:index_end] return segments -def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4): +def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False): """Create random segments based on the input lengths. Args: x (torch.tensor): Input tensor. x_lengths (torch.tensor): Input lengths. segment_size (int): Expected output segment size. + let_short_samples (bool): Allow shorter samples than the segment size. + pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size. Shapes: - x: :math:`[B, C, T]` - x_lengths: :math:`[B]` """ + _x_lenghts = x_lengths.clone() B, _, T = x.size() - if x_lengths is None: - x_lengths = T - max_idxs = x_lengths - segment_size + 1 - assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size." - segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long() + if pad_short: + if T < segment_size: + x = torch.nn.functional.pad(x, (0, segment_size - T)) + T = segment_size + if _x_lenghts is None: + _x_lenghts = T + len_diff = _x_lenghts - segment_size + 1 + if let_short_samples: + _x_lenghts[len_diff < 0] = segment_size + len_diff = _x_lenghts - segment_size + 1 + else: + assert all(len_diff > 0), f" [!] At least one sample is shorter than the segment size ({segment_size}). 
\n {_x_lenghts}"
+ segment_indices = (torch.rand([B]).type_as(x) * len_diff).long()
 ret = segment(x, segment_indices, segment_size)
 return ret, segment_indices
diff --git a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py
index 6a2f260d28..708ecbf50e 100644
--- a/tests/tts_tests/test_helpers.py
+++ b/tests/tts_tests/test_helpers.py
@@ -1,6 +1,6 @@
 import torch as T

-from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask
+from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask, rand_segments


 def average_over_durations_test(): # pylint: disable=no-self-use
@@ -39,6 +39,34 @@ def segment_test():
 for idx, start_indx in enumerate(segment_ids):
 assert x[idx, :, start_indx : start_indx + 4].sum() == segments[idx, :, :].sum()

+ try:
+ segments = segment(x, segment_ids, segment_size=10)
+ raise Exception("Should have failed")
+ except:
+ pass
+
+ segments = segment(x, segment_ids, segment_size=10, pad_short=True)
+ for idx, start_indx in enumerate(segment_ids):
+ assert x[idx, :, start_indx : start_indx + 10].sum() == segments[idx, :, :].sum()
+
+
+def rand_segments_test():
+ x = T.rand(2, 3, 4)
+ x_lens = T.randint(3, 4, (2,))
+ segments, seg_idxs = rand_segments(x, x_lens, segment_size=3)
+ assert segments.shape == (2, 3, 3)
+ assert all(seg_idxs >= 0), seg_idxs
+ try:
+ segments, _ = rand_segments(x, x_lens, segment_size=5)
+ raise Exception("Should have failed")
+ except:
+ pass
+ x_lens_back = x_lens.clone()
+ segments, seg_idxs= rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
+ assert segments.shape == (2, 3, 5)
+ assert all(seg_idxs >= 0), seg_idxs
+ assert all(x_lens_back == x_lens)
+

 def generate_path_test():
 durations = T.randint(1, 4, (10, 21))

From 8c555d39070cbe869b41f86e6189febc8f01c7b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 21 Jan 2022 15:29:06 +0000
Subject: [PATCH 50/67] Implement `start_by_longest` option for TTSDataset

---
 TTS/tts/configs/shared_configs.py | 5 +++++
 TTS/tts/configs/vits_config.py | 13 +------------
 TTS/tts/datasets/dataset.py | 10 ++++++++++
 TTS/tts/models/base_tts.py | 1 +
 tests/data_tests/test_loader.py | 20 +++++++++++++++++++-
 5 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index c7958fda00..09266ce2e7 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -172,6 +172,10 @@ class BaseTTSConfig(BaseTrainingConfig):
 use_noise_augment (bool):
 Augment the input audio with random noise.

+ start_by_longest (bool):
+ If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
+ Defaults to False.
+
 add_blank (bool):
 Add blank characters between each other two characters. It improves performance for some models at expense of slower run-time due to the longer input sequence.
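The option documented above reduces to a single swap on the ascending sort order computed in `preprocess_samples` (see the `dataset.py` hunk later in this patch), so the longest sample lands in the very first batch. A minimal standalone sketch of that swap, assuming only `numpy` and a toy list of per-sample audio lengths:

import numpy as np

audio_lengths = np.array([120, 80, 300, 45, 200])  # frames per sample (toy values)
sorted_idxs = np.argsort(audio_lengths)            # ascending order: [3 1 0 4 2]

start_by_longest = True
if start_by_longest:
    # swap the longest sample (last index) to the front; an OOM triggered by
    # the longest batch then surfaces immediately instead of mid-training
    longest_idx = sorted_idxs[-1]
    sorted_idxs[-1] = sorted_idxs[0]
    sorted_idxs[0] = longest_idx

print(sorted_idxs)                                 # [2 1 0 4 3]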
@@ -224,6 +228,7 @@ class BaseTTSConfig(BaseTrainingConfig):
 compute_linear_spec: bool = False
 precompute_num_workers: int = 0
 use_noise_augment: bool = False
+ start_by_longest: bool = False
 # dataset
 datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
 # optimizer
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
index 36c948afd5..d306552df3 100644
--- a/TTS/tts/configs/vits_config.py
+++ b/TTS/tts/configs/vits_config.py
@@ -67,15 +67,6 @@ class VitsConfig(BaseTTSConfig):
 compute_linear_spec (bool):
 If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

- sort_by_audio_len (bool):
- If true, dataloder sorts the data by audio length else sorts by the input text length. Defaults to `True`.
-
- min_seq_len (int):
- Minimum sequnce length to be considered for training. Defaults to `0`.
-
- max_seq_len (int):
- Maximum sequnce length to be considered for training. Defaults to `500000`.
-
 r (int):
 Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.

@@ -123,6 +114,7 @@ class VitsConfig(BaseTTSConfig):
 feat_loss_alpha: float = 1.0
 mel_loss_alpha: float = 45.0
 dur_loss_alpha: float = 1.0
+ aligner_loss_alpha = 1.0
 speaker_encoder_loss_alpha: float = 1.0

 # data loader params
@@ -130,9 +122,6 @@ class VitsConfig(BaseTTSConfig):
 compute_linear_spec: bool = True

 # overrides
- sort_by_audio_len: bool = True
- min_seq_len: int = 0
- max_seq_len: int = 500000
 r: int = 1 # DO NOT CHANGE
 add_blank: bool = True
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 5fab71088d..99d9429937 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -56,6 +56,7 @@ def __init__(
 d_vector_mapping: Dict = None,
 language_id_mapping: Dict = None,
 use_noise_augment: bool = False,
+ start_by_longest: bool = False,
 verbose: bool = False,
 ):
 """Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
@@ -109,6 +110,8 @@ def __init__(
 use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.

+ start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
+
 verbose (bool): Print diagnostic information. Defaults to false.
 """
 super().__init__()
@@ -130,6 +133,7 @@ def __init__(
 self.d_vector_mapping = d_vector_mapping
 self.language_id_mapping = language_id_mapping
 self.use_noise_augment = use_noise_augment
+ self.start_by_longest = start_by_longest
 self.verbose = verbose
 self.rescue_item_idx = 1
@@ -316,6 +320,12 @@ def preprocess_samples(self):
 samples, audio_lengths, _ = self.select_samples_by_idx(keep_idx)

 sorted_idxs = self.sort_by_length(audio_lengths)
+
+ if self.start_by_longest:
+ longest_idxs = sorted_idxs[-1]
+ sorted_idxs[-1] = sorted_idxs[0]
+ sorted_idxs[0] = longest_idxs
+
 samples, audio_lengths, text_lengtsh = self.select_samples_by_idx(sorted_idxs)

 if len(samples) == 0:
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 9a6a56df76..7cdfa915df 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -290,6 +290,7 @@ def get_data_loader(
 speaker_id_mapping=speaker_id_mapping,
 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
 tokenizer=self.tokenizer,
+ start_by_longest=config.start_by_longest,
 language_id_mapping=language_id_mapping,
 )

diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index f2f2a8d238..477ee71fb0 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -37,7 +37,7 @@ def __init__(self, *args, **kwargs):
 self.max_loader_iter = 4
 self.ap = AudioProcessor(**c.audio)

- def _create_dataloader(self, batch_size, r, bgs):
+ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
 items = ljspeech(c.data_path, "metadata.csv")
 tokenizer, _ = TTSTokenizer.init_from_config(c)
 dataset = TTSDataset(
@@ -52,6 +52,7 @@ def _create_dataloader(self, batch_size, r, bgs):
 max_text_len=c.max_text_len,
 min_audio_len=c.min_audio_len,
 max_audio_len=c.max_audio_len,
+ start_by_longest=start_by_longest
 )
 dataloader = DataLoader(
 dataset,
@@ -127,6 +128,23 @@ def test_batch_group_shuffle(self):
 self.assertGreaterEqual(avg_length, last_length)
 self.assertTrue(is_items_reordered)

+ def test_start_by_longest(self):
+ """Test start_by_longest option.
+
+ The first item of the first batch must be longer than all the other items.
+ """
+ if ok_ljspeech:
+ dataloader, _ = self._create_dataloader(2, c.r, 0, True)
+ dataloader.dataset.preprocess_samples()
+ for i, data in enumerate(dataloader):
+ if i == self.max_loader_iter:
+ break
+ mel_lengths = data["mel_lengths"]
+ if i == 0:
+ max_len = mel_lengths[0]
+ print(mel_lengths)
+ self.assertTrue(all(max_len >= mel_lengths))
+
 def test_padding_and_spectrograms(self):
 def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
 self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding

From c3ae11482d71572db0424112a78fce2b2e9c5d82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 21 Jan 2022 15:33:15 +0000
Subject: [PATCH 51/67] Refactor VITS model

---
 TTS/tts/models/vits.py | 106 ++++++++++++++++++++++++++---------------
 1 file changed, 68 insertions(+), 38 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 34cb69c8a7..301ddfcd6e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -39,7 +39,7 @@ class VitsArgs(Coqpit):
 Number of characters in the vocabulary. Defaults to 100.

 out_channels (int):
- Number of output channels. Defaults to 513.
+ Number of output channels of the decoder. Defaults to 513.

 spec_segment_size (int):
 Decoder input segment size. Defaults to 32 `(32 * hoplength = waveform length)`.
@@ -360,6 +360,8 @@ def __init__( language_emb_dim=self.embedded_language_dim, ) + upsample_rate = math.prod(self.args.upsample_rates_decoder) + assert upsample_rate == self.config.audio.hop_length, f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" self.waveform_decoder = HifiganGenerator( self.args.hidden_channels, 1, @@ -536,6 +538,54 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } + def _set_speaker_input(self, aux_input: Dict): + d_vectors = aux_input.get("d_vectors", None) + speaker_ids = aux_input.get("speaker_ids", None) + + if d_vectors is not None and speaker_ids is not None: + raise ValueError("[!] Cannot use d-vectors and speaker-ids together.") + + if speaker_ids is not None and not hasattr(self, "emb_g"): + raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.") + + g = speaker_ids if speaker_ids is not None else d_vectors + return g + + def forward_mas(self, outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g, lang_emb): + # find the alignment path + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) + with torch.no_grad(): + o_scale = torch.exp(-2 * logs_p) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) + logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) + logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + logp1 + logp4 + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() # [b, 1, t, t'] + + # duration predictor + attn_durations = attn.sum(3) + if self.args.use_sdp: + loss_duration = self.duration_predictor( + x.detach() if self.args.detach_dp_input else x, + x_mask, + attn_durations, + g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, + ) + loss_duration = loss_duration / torch.sum(x_mask) + else: + attn_log_durations = torch.log(attn_durations + 1e-6) * x_mask + log_durations = self.duration_predictor( + x.detach() if self.args.detach_dp_input else x, + x_mask, + g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, + ) + loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) + outputs["loss_duration"] = loss_duration + return outputs, attn + def forward( self, x: torch.tensor, @@ -601,51 +651,27 @@ def forward( # flow layers z_p = self.flow(z, y_mask, g=g) - # find the alignment path - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - with torch.no_grad(): - o_scale = torch.exp(-2 * logs_p) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) - logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp1 + logp4 - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - # duration predictor - attn_durations = attn.sum(3) - if self.args.use_sdp: - loss_duration = self.duration_predictor( - x.detach() if self.args.detach_dp_input else x, - x_mask, - attn_durations, - g=g.detach() if self.args.detach_dp_input 
and g is not None else g, - lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, - ) - loss_duration = loss_duration / torch.sum(x_mask) - else: - attn_log_durations = torch.log(attn_durations + 1e-6) * x_mask - log_durations = self.duration_predictor( - x.detach() if self.args.detach_dp_input else x, - x_mask, - g=g.detach() if self.args.detach_dp_input and g is not None else g, - lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, - ) - loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) - outputs["loss_duration"] = loss_duration + if self.args.use_mas: + outputs, attn = self.forward_mas(outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g=g, lang_emb=lang_emb) + elif self.args.use_aligner_network: + outputs, attn = self.forward_aligner(outputs, m_p, z_p, x_mask, y_mask, g=g, lang_emb=lang_emb) + outputs["x_lens"] = x_lengths + outputs["y_lens"] = y_lengths # expand prior m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) # select a random feature segment for the waveform decoder - z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size) + z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True) o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( waveform, slice_ids * self.config.audio.hop_length, self.args.spec_segment_size * self.config.audio.hop_length, + pad_short=True ) if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: @@ -667,11 +693,11 @@ def forward( outputs.update( { "model_outputs": o, - "alignments": attn.squeeze(1), - "z": z, - "z_p": z_p, + "alignments" : attn.squeeze(1), "m_p": m_p, "logs_p": logs_p, + "z": z, + "z_p": z_p, "m_q": m_q, "logs_q": logs_q, "waveform_seg": wav_seg, @@ -914,14 +940,18 @@ def train_log( Returns: Tuple[Dict, np.ndarray]: training plots and output waveform. """ - self._log(self.ap, batch, outputs, "train") + figures, audios = self._log(self.ap, batch, outputs, "train") + logger.eval_figures(steps, figures) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - return self._log(self.ap, batch, outputs, "eval") + figures, audios = self._log(self.ap, batch, outputs, "eval") + logger.eval_figures(steps, figures) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def test_run(self, assets) -> Tuple[Dict, Dict]: From c94112f63353443b0f36131dd34e3538ec6fcf2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:22:35 +0000 Subject: [PATCH 52/67] Update GAN model --- TTS/vocoder/models/gan.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index f78d69b86e..7e03e94f2e 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -19,7 +19,7 @@ class GAN(BaseVocoder): - def __init__(self, config: Coqpit): + def __init__(self, config: Coqpit, ap: AudioProcessor=None): """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer. 
It also helps mixing and matching different generator and disciminator networks easily.
@@ -28,6 +28,7 @@ def __init__(self, config: Coqpit):

 Args:
 config (Coqpit): Model configuration.
+ ap (AudioProcessor): 🐸TTS AudioProcessor instance. Defaults to None.

 Examples:
 Initializing the GAN model with HifiGAN generator and discriminator.
@@ -41,6 +42,7 @@ def __init__(self, config: Coqpit):
 self.model_d = setup_discriminator(config)
 self.train_disc = False # if False, train only the generator.
 self.y_hat_g = None # the last generator prediction to be passed onto the discriminator
+ self.ap = ap

 def forward(self, x: torch.Tensor) -> torch.Tensor:
 """Run the generator's forward pass.
@@ -201,10 +203,9 @@ def train_log(
 self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
 ) -> Tuple[Dict, np.ndarray]:
 """Call `_log()` for training."""
- ap = assets["audio_processor"]
- figures, audios = self._log("eval", ap, batch, outputs)
+ figures, audios = self._log("eval", self.ap, batch, outputs)
 logger.eval_figures(steps, figures)
- logger.eval_audios(steps, audios, ap.sample_rate)
+ logger.eval_audios(steps, audios, self.ap.sample_rate)

 @torch.no_grad()
 def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
@@ -215,10 +216,9 @@ def eval_log(
 self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
 ) -> Tuple[Dict, np.ndarray]:
 """Call `_log()` for evaluation."""
- ap = assets["audio_processor"]
- figures, audios = self._log("eval", ap, batch, outputs)
+ figures, audios = self._log("eval", self.ap, batch, outputs)
 logger.eval_figures(steps, figures)
- logger.eval_audios(steps, audios, ap.sample_rate)
+ logger.eval_audios(steps, audios, self.ap.sample_rate)

 def load_checkpoint(
 self,
@@ -330,12 +330,11 @@ def get_data_loader( # pylint: disable=no-self-use
 Returns:
 DataLoader: Torch dataloader.
 """
- ap = assets["audio_processor"]
 dataset = GANDataset(
- ap=ap,
+ ap=self.ap,
 items=data_items,
 seq_len=config.seq_len,
- hop_len=ap.hop_length,
+ hop_len=self.ap.hop_length,
 pad_short=config.pad_short,
 conv_pad=config.conv_pad,
 return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
@@ -363,5 +362,6 @@ def get_criterion(self):
 return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)]

 @staticmethod
- def init_from_config(config: Coqpit) -> "GAN":
- return GAN(config)
+ def init_from_config(config: Coqpit, verbose=True) -> "GAN":
+ ap = AudioProcessor.init_from_config(config, verbose=verbose)
+ return GAN(config, ap=ap)

From 2386d804f13997f2d1eb6866e03f6bd6b3a78e91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 25 Jan 2022 09:23:07 +0000
Subject: [PATCH 53/67] Take file extension as an argument

---
 TTS/vocoder/datasets/preprocess.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py
index d8cc350ad7..0f69b812fa 100644
--- a/TTS/vocoder/datasets/preprocess.py
+++ b/TTS/vocoder/datasets/preprocess.py
@@ -33,8 +33,8 @@ def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
 np.save(quant_path, quant)


-def find_wav_files(data_path):
- wav_paths = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True)
+def find_wav_files(data_path, file_ext="wav"):
+ wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True)
 return wav_paths


@@ -43,8 +43,9 @@ def find_feat_files(data_path):
 return feat_paths


-def load_wav_data(data_path, eval_split_size):
- wav_paths = find_wav_files(data_path)
+def load_wav_data(data_path, eval_split_size, file_ext="wav"):
+ wav_paths = find_wav_files(data_path, file_ext=file_ext)
+ assert len(wav_paths) > 0, f" [!] {data_path} is empty."
 np.random.seed(0)
 np.random.shuffle(wav_paths)
 return wav_paths[:eval_split_size], wav_paths[eval_split_size:]

From 269f8c6ee34834f221c0f8ea15874a822de09f1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 25 Jan 2022 09:25:32 +0000
Subject: [PATCH 54/67] Update synthesizer to use init_from_config

---
 TTS/utils/synthesizer.py | 52 ----------------------------------------
 1 file changed, 52 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index a1a323e819..ddc2a6a545 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -110,21 +110,12 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
 use_cuda (bool): enable/disable CUDA use.
""" # pylint: disable=global-statement - self.tts_config = load_config(tts_config_path) self.use_phonemes = self.tts_config.use_phonemes self.tts_model = setup_tts_model(config=self.tts_config) - speaker_manager = self._init_speaker_manager() - language_manager = self._init_language_manager() if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() - speaker_manager = self._init_speaker_encoder(speaker_manager) - - if language_manager is not None: - self.tts_model = setup_tts_model(config=self.tts_config) - else: - self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -157,49 +148,6 @@ def _is_use_d_vector_file(self): use_d_vector_file = use_d_vector_file or config.get("use_d_vector_file", False) return use_d_vector_file - def _init_speaker_manager(self): - """Initialize the SpeakerManager""" - # setup if multi-speaker settings are in the global model config - speaker_manager = None - speakers_file = get_from_config_or_model_args_with_default(self.tts_config, "speakers_file", None) - if self._is_use_speaker_embedding(): - if self.tts_speakers_file: - speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file) - elif speakers_file: - speaker_manager = SpeakerManager(speaker_id_file_path=speakers_file) - - if self._is_use_d_vector_file(): - d_vector_file = get_from_config_or_model_args_with_default(self.tts_config, "d_vector_file", None) - if self.tts_speakers_file: - speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file) - elif d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=d_vector_file) - return speaker_manager - - def _init_speaker_encoder(self, speaker_manager): - """Initialize the SpeakerEncoder""" - if self.encoder_checkpoint: - if speaker_manager is None: - speaker_manager = SpeakerManager( - encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config - ) - else: - speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) - return speaker_manager - - def _init_language_manager(self): - """Initialize the LanguageManager""" - # setup if multi-lingual settings are in the global model config - language_manager = None - if check_config_and_model_args(self.tts_config, "use_language_embedding", True): - if self.tts_languages_file: - language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) - elif self.tts_config.get("language_ids_file", None): - language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file) - else: - language_manager = LanguageManager(config=self.tts_config) - return language_manager - def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. From a27133db12d7e57433bda7b7fbdabbaa63f18458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:26:23 +0000 Subject: [PATCH 55/67] Add pitch_fmin pitch_fmax args to the audio --- TTS/utils/audio.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index e92acf574e..acd2cfcb86 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -239,6 +239,12 @@ class AudioProcessor(object): mel_fmax (int, optional): maximum filter frequency for computing melspectrograms. Defaults to None. + pitch_fmin (int, optional): + minimum filter frequency for computing pitch. 
Defaults to None. + + pitch_fmax (int, optional): + maximum filter frequency for computing pitch. Defaults to None. + spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. @@ -300,6 +306,8 @@ def __init__( max_norm=None, mel_fmin=None, mel_fmax=None, + pitch_fmax=None, + pitch_fmin=None, spec_gain=20, stft_pad_mode="reflect", clip_norm=True, @@ -333,6 +341,8 @@ def __init__( self.symmetric_norm = symmetric_norm self.mel_fmin = mel_fmin or 0 self.mel_fmax = mel_fmax + self.pitch_fmin = pitch_fmin + self.pitch_fmax = pitch_fmax self.spec_gain = float(spec_gain) self.stft_pad_mode = stft_pad_mode self.max_norm = 1.0 if max_norm is None else float(max_norm) @@ -726,12 +736,12 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: >>> WAV_FILE = filename = librosa.util.example_audio_file() >>> from TTS.config import BaseAudioConfig >>> from TTS.utils.audio import AudioProcessor - >>> conf = BaseAudioConfig(mel_fmax=8000) + >>> conf = BaseAudioConfig(pitch_fmax=8000) >>> ap = AudioProcessor(**conf) >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> pitch = ap.compute_f0(wav) """ - assert self.mel_fmax is not None, " [!] Set `mel_fmax` before caling `compute_f0`." + assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." # align F0 length to the spectrogram length if len(x) % self.hop_length == 0: x = np.pad(x, (0, self.hop_length // 2), mode="reflect") @@ -739,7 +749,7 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: f0, t = pw.dio( x.astype(np.double), fs=self.sample_rate, - f0_ceil=self.mel_fmax, + f0_ceil=self.pitch_fmax, frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) From 2303c91f5a2634d1b7bf840ad958637345780cac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:26:47 +0000 Subject: [PATCH 56/67] Plot pitch over input characters --- TTS/tts/utils/visual.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py index de6d95c5a0..4fd1f19cb8 100644 --- a/TTS/tts/utils/visual.py +++ b/TTS/tts/utils/visual.py @@ -87,6 +87,39 @@ def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False) return fig +def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False): + """Plot pitch curves on top of the input characters. + + Args: + pitch (np.array): Pitch values. + chars (str): Characters to place to the x-axis. 
+ + Shapes: + pitch: :math:`(T,)` + """ + old_fig_size = plt.rcParams["figure.figsize"] + if fig_size is not None: + plt.rcParams["figure.figsize"] = fig_size + + fig, ax = plt.subplots() + + x = np.array(range(len(chars))) + my_xticks = [c for c in chars] + plt.xticks(x, my_xticks) + + ax.set_xlabel("characters") + ax.set_ylabel("freq") + + ax2 = ax.twinx() + ax2.plot(pitch, linewidth=5.0, color="red") + ax2.set_ylabel("F0") + + plt.rcParams["figure.figsize"] = old_fig_size + if not output_fig: + plt.close() + return fig + + def visualize( alignment, postnet_output, From a8352d9fa9a3724a482945c61dc77cc2c5f891b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:27:13 +0000 Subject: [PATCH 57/67] Update language manager --- TTS/tts/utils/languages.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 5cecbe6908..8f14d71735 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,7 @@ import json import os from typing import Dict, List +from TTS.config import check_config_and_model_args import fsspec import numpy as np @@ -105,7 +106,12 @@ def init_from_config(config: Coqpit) -> "LanguageManager": Args: config (Coqpit): Coqpit config. """ - return LanguageManager(config=config) + language_manager = None + if check_config_and_model_args(config, "use_language_embedding", True): + if config.get("language_ids_file", None): + language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) + language_manager = LanguageManager(config=config) + return language_manager def _set_file_path(path): From 839750202320184f53b5cc4023564e03270695d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:28:33 +0000 Subject: [PATCH 58/67] Update forwardtts --- TTS/tts/layers/losses.py | 6 +++++- TTS/tts/models/forward_tts.py | 29 ++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 7de4504142..75320d1078 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -740,6 +740,7 @@ def forward( alignment_logprob=None, alignment_hard=None, alignment_soft=None, + binary_loss_weight=None ): loss = 0 return_dict = {} @@ -772,7 +773,10 @@ def forward( if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None: binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss - return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss + if binary_loss_weight: + return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + else: + return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss return_dict["loss"] = loss return return_dict diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 699f31426c..bb8640a3dd 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -15,7 +15,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram +from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, 
plot_spectrogram @dataclass @@ -186,7 +186,7 @@ def __init__( self.max_duration = self.args.max_duration self.use_aligner = self.args.use_aligner self.use_pitch = self.args.use_pitch - self.use_binary_alignment_loss = False + self.binary_loss_weight = 0.0 self.length_scale = ( float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale @@ -644,8 +644,9 @@ def train_step(self, batch: dict, criterion: nn.Module): pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None, input_lens=text_lengths, alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None, - alignment_soft=outputs["alignment_soft"] if self.use_binary_alignment_loss else None, - alignment_hard=outputs["alignment_mas"] if self.use_binary_alignment_loss else None, + alignment_soft=outputs["alignment_soft"], + alignment_hard=outputs["alignment_mas"], + binary_loss_weight=self.binary_loss_weight ) # compute duration error durations_pred = outputs["durations"] @@ -672,17 +673,12 @@ def _create_logs(self, batch, outputs, ap): # plot pitch figures if self.args.use_pitch: - pitch = batch["pitch"] - pitch_avg_expanded, _ = self.expand_encoder_outputs( - outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"] - ) - pitch = pitch[0, 0].data.cpu().numpy() - # TODO: denormalize before plotting - pitch = abs(pitch) - pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy() + pitch_avg = abs(outputs["pitch_avg_gt"][0, 0].data.cpu().numpy()) + pitch_avg_hat = abs(outputs["pitch_avg"][0, 0].data.cpu().numpy()) + chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy()) pitch_figures = { - "pitch_ground_truth": plot_pitch(pitch, gt_spec, ap, output_fig=False), - "pitch_avg_predicted": plot_pitch(pitch_avg_expanded, pred_spec, ap, output_fig=False), + "pitch_ground_truth": plot_avg_pitch(pitch_avg, chars, output_fig=False), + "pitch_avg_predicted": plot_avg_pitch(pitch_avg_hat, chars, output_fig=False), } figures.update(pitch_figures) @@ -725,9 +721,8 @@ def get_criterion(self): return ForwardTTSLoss(self.config) def on_train_step_start(self, trainer): - """Enable binary alignment loss when needed""" - if trainer.total_steps_done > self.config.binary_align_loss_start_step: - self.use_binary_alignment_loss = True + """Schedule binary loss weight.""" + self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): From c2d5be53886984d023b4f6e26dadb24d3e8b3a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:28:48 +0000 Subject: [PATCH 59/67] Fix dataset preprocessing --- TTS/tts/datasets/dataset.py | 67 ++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 99d9429937..10fd1696ae 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,4 +1,5 @@ import collections +from email.mime import audio import os import random from typing import Dict, List, Union @@ -140,8 +141,6 @@ def __init__( self.pitch_computed = False self.tokenizer = tokenizer - self.audio_lengths, self.text_lengths = self.compute_lengths(self.samples) - if self.tokenizer.use_phonemes: self.phoneme_dataset = PhonemeDataset( self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers @@ -254,16 +253,14 @@ def 
load_data(self, idx): return sample @staticmethod - def compute_lengths(samples): - audio_lengths = [] - text_lengths = [] + def _compute_lengths(samples): + new_samples = [] for item in samples: text, wav_file, *_ = _parse_sample(item) - audio_lengths.append(os.path.getsize(wav_file) / 16 * 8) # assuming 16bit audio - text_lengths.append(len(text)) - audio_lengths = np.array(audio_lengths) - text_lengths = np.array(text_lengths) - return audio_lengths, text_lengths + audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + text_lenght = len(text) + new_samples += [item + [audio_length, text_lenght]] + return new_samples @staticmethod def filter_by_length(lengths: List[int], min_len: int, max_len: int): @@ -279,8 +276,9 @@ def filter_by_length(lengths: List[int], min_len: int, max_len: int): return ignore_idx, keep_idx @staticmethod - def sort_by_length(lengths: List[int]): - idxs = np.argsort(lengths) # ascending order + def sort_by_length(samples: List[List]): + audio_lengths = [s[-2] for s in samples] + idxs = np.argsort(audio_lengths) # ascending order return idxs @staticmethod @@ -294,39 +292,38 @@ def create_buckets(samples, batch_group_size: int): samples[offset:end_offset] = temp_items return samples - def select_samples_by_idx(self, idxs): - samples = [] - audio_lengths = [] - text_lengths = [] + def _select_samples_by_idx(self, idxs, samples): + samples_new = [] for idx in idxs: - samples.append(self.samples[idx]) - audio_lengths.append(self.audio_lengths[idx]) - text_lengths.append(self.text_lengths[idx]) - return samples, audio_lengths, text_lengths + samples_new.append(samples[idx]) + return samples_new def preprocess_samples(self): r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length range. """ + samples = self._compute_lengths(self.samples) # sort items based on the sequence length in ascending order - text_ignore_idx, text_keep_idx = self.filter_by_length(self.text_lengths, self.min_text_len, self.max_text_len) + text_lengths = [i[-1] for i in samples] + audio_lengths = [i[-2] for i in samples] + text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len) audio_ignore_idx, audio_keep_idx = self.filter_by_length( - self.audio_lengths, self.min_audio_len, self.max_audio_len + audio_lengths, self.min_audio_len, self.max_audio_len ) - keep_idx = list(set(audio_keep_idx) | set(text_keep_idx)) + keep_idx = list(set(audio_keep_idx) & set(text_keep_idx)) ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx)) - samples, audio_lengths, _ = self.select_samples_by_idx(keep_idx) + samples = self._select_samples_by_idx(keep_idx, samples) - sorted_idxs = self.sort_by_length(audio_lengths) + sorted_idxs = self.sort_by_length(samples) if self.start_by_longest: longest_idxs = sorted_idxs[-1] sorted_idxs[-1] = sorted_idxs[0] sorted_idxs[0] = longest_idxs - samples, audio_lengths, text_lengtsh = self.select_samples_by_idx(sorted_idxs) + samples = self._select_samples_by_idx(sorted_idxs, samples) if len(samples) == 0: raise RuntimeError(" [!] 
No samples left") @@ -338,19 +335,19 @@ def preprocess_samples(self): samples = self.create_buckets(samples, self.batch_group_size) # update items to the new sorted items - self.samples = samples - self.audio_lengths = audio_lengths - self.text_lengths = text_lengtsh + audio_lengths = [s[-2] for s in samples] + text_lengths = [s[-1] for s in samples] + self.samples = [s[:-2] for s in samples] if self.verbose: print(" | > Preprocessing samples") - print(" | > Max text length: {}".format(np.max(self.text_lengths))) - print(" | > Min text length: {}".format(np.min(self.text_lengths))) - print(" | > Avg text length: {}".format(np.mean(self.text_lengths))) + print(" | > Max text length: {}".format(np.max(text_lengths))) + print(" | > Min text length: {}".format(np.min(text_lengths))) + print(" | > Avg text length: {}".format(np.mean(text_lengths))) print(" | ") - print(" | > Max audio length: {}".format(np.max(self.audio_lengths))) - print(" | > Min audio length: {}".format(np.min(self.audio_lengths))) - print(" | > Avg audio length: {}".format(np.mean(self.audio_lengths))) + print(" | > Max audio length: {}".format(np.max(audio_lengths))) + print(" | > Min audio length: {}".format(np.min(audio_lengths))) + print(" | > Avg audio length: {}".format(np.mean(audio_lengths))) print(f" | > Num. instances discarded samples: {len(ignore_idx)}") print(" | > Batch group size: {}.".format(self.batch_group_size)) From ad983065f71fc96591a92c1086a4fce59eb9359e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:29:21 +0000 Subject: [PATCH 60/67] Update FastPitchConfig --- TTS/tts/configs/fast_pitch_config.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index 8f0631028a..de87038807 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -89,12 +89,9 @@ class FastPitchConfig(BaseTTSConfig): pitch_loss_alpha (float): Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0. - binary_loss_alpha (float): + binary_align_loss_alpha (float): Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0. - binary_align_loss_start_step (int): - Start binary alignment loss after this many steps. Defaults to 20000. - min_seq_len (int): Minimum input sequence length to be used at training. 
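The `binary_loss_warmup_epochs` default introduced in this patch replaces the old hard `binary_align_loss_start_step` switch with a linear ramp, matching `on_train_step_start` in PATCH 58 above. A minimal sketch of the schedule, assuming `epochs_done` is reported by the trainer:

binary_loss_warmup_epochs = 150  # the new config default

def binary_loss_weight(epochs_done: int) -> float:
    # ramps linearly from 0.0 to 1.0 over the warm-up epochs, then stays at
    # 1.0 for the rest of training; the trailing * 1.0 mirrors the patch
    return min(epochs_done / binary_loss_warmup_epochs, 1.0) * 1.0

for epoch in (0, 75, 150, 300):
    print(epoch, binary_loss_weight(epoch))  # 0.0, 0.5, 1.0, 1.0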
@@ -129,12 +126,12 @@ class FastPitchConfig(BaseTTSConfig): duration_loss_type: str = "mse" use_ssim_loss: bool = True ssim_loss_alpha: float = 1.0 - dur_loss_alpha: float = 1.0 spec_loss_alpha: float = 1.0 - pitch_loss_alpha: float = 1.0 aligner_loss_alpha: float = 1.0 - binary_align_loss_alpha: float = 1.0 - binary_align_loss_start_step: int = 20000 + pitch_loss_alpha: float = 0.1 + dur_loss_alpha: float = 0.1 + binary_align_loss_alpha: float = 0.1 + binary_loss_warmup_epochs: int = 150 # overrides min_seq_len: int = 13 From f966a459e77ef06f16c045c47e1579106ac9a9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 10:40:29 +0000 Subject: [PATCH 61/67] Make style --- TTS/bin/find_unique_phonemes.py | 1 - TTS/config/shared_configs.py | 9 +++++++++ TTS/tts/datasets/dataset.py | 10 ++++------ TTS/tts/layers/losses.py | 6 ++++-- TTS/tts/models/forward_tts.py | 2 +- TTS/tts/models/vits.py | 10 ++++++---- TTS/tts/utils/helpers.py | 12 ++++++++---- TTS/tts/utils/languages.py | 3 ++- TTS/tts/utils/visual.py | 2 +- TTS/utils/synthesizer.py | 4 +--- TTS/vocoder/models/gan.py | 4 ++-- tests/data_tests/test_loader.py | 2 +- tests/tts_tests/test_helpers.py | 6 +++--- 13 files changed, 42 insertions(+), 29 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index e84c17de2f..10c7110750 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -9,7 +9,6 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut - phonemizer = Gruut(language="en-us") diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 217282adb0..392f10af56 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -57,6 +57,12 @@ class BaseAudioConfig(Coqpit): do_amp_to_db_mel (bool, optional): enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + pitch_fmax (float, optional): + Maximum frequency of the F0 frames. Defaults to ```640```. + + pitch_fmin (float, optional): + Minimum frequency of the F0 frames. Defaults to ```0```. + trim_db (int): Silence threshold used for silence trimming. Defaults to 45. 
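These `pitch_fmax`/`pitch_fmin` values feed the F0 extractor rather than the mel filterbank, mirroring `compute_f0` from PATCH 55 above. A minimal sketch of that extraction step, assuming `pyworld` and `numpy` are installed and using the new defaults; the one-second noise buffer is only a stand-in for a real waveform:

import numpy as np
import pyworld as pw

sample_rate, hop_length, pitch_fmax = 22050, 256, 640.0
wav = np.random.uniform(-1, 1, sample_rate).astype(np.double)  # 1 s stand-in signal

f0, t = pw.dio(
    wav,
    fs=sample_rate,
    f0_ceil=pitch_fmax,  # capped by pitch_fmax now, not mel_fmax
    frame_period=1000 * hop_length / sample_rate,  # one F0 value per hop
)
f0 = pw.stonemask(wav, f0, t, sample_rate)  # refine the coarse dio estimate
print(f0.shape)  # roughly len(wav) / hop_length frames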
@@ -135,6 +141,9 @@ class BaseAudioConfig(Coqpit): spec_gain: int = 20 do_amp_to_db_linear: bool = True do_amp_to_db_mel: bool = True + # f0 params + pitch_fmax: float = 640.0 + pitch_fmin: float = 0.0 # normalization params signal_norm: bool = True min_level_db: int = -100 diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 10fd1696ae..0bcc554c51 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,5 +1,4 @@ import collections -from email.mime import audio import os import random from typing import Dict, List, Union @@ -257,7 +256,7 @@ def _compute_lengths(samples): new_samples = [] for item in samples: text, wav_file, *_ = _parse_sample(item) - audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio text_lenght = len(text) new_samples += [item + [audio_length, text_lenght]] return new_samples @@ -292,7 +291,8 @@ def create_buckets(samples, batch_group_size: int): samples[offset:end_offset] = temp_items return samples - def _select_samples_by_idx(self, idxs, samples): + @staticmethod + def _select_samples_by_idx(idxs, samples): samples_new = [] for idx in idxs: samples_new.append(samples[idx]) @@ -308,9 +308,7 @@ def preprocess_samples(self): text_lengths = [i[-1] for i in samples] audio_lengths = [i[-2] for i in samples] text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len) - audio_ignore_idx, audio_keep_idx = self.filter_by_length( - audio_lengths, self.min_audio_len, self.max_audio_len - ) + audio_ignore_idx, audio_keep_idx = self.filter_by_length(audio_lengths, self.min_audio_len, self.max_audio_len) keep_idx = list(set(audio_keep_idx) & set(text_keep_idx)) ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx)) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 75320d1078..b7c8f6e458 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -740,7 +740,7 @@ def forward( alignment_logprob=None, alignment_hard=None, alignment_soft=None, - binary_loss_weight=None + binary_loss_weight=None, ): loss = 0 return_dict = {} @@ -774,7 +774,9 @@ def forward( binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss if binary_loss_weight: - return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + return_dict["loss_binary_alignment"] = ( + self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + ) else: return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index bb8640a3dd..8d554f767f 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -646,7 +646,7 @@ def train_step(self, batch: dict, criterion: nn.Module): alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None, alignment_soft=outputs["alignment_soft"], alignment_hard=outputs["alignment_mas"], - binary_loss_weight=self.binary_loss_weight + binary_loss_weight=self.binary_loss_weight, ) # compute duration error durations_pred = outputs["durations"] diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 301ddfcd6e..acd5c1729d 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -361,7 +361,9 @@ def __init__( ) upsample_rate = 
math.prod(self.args.upsample_rates_decoder) - assert upsample_rate == self.config.audio.hop_length, f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" + assert ( + upsample_rate == self.config.audio.hop_length + ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" self.waveform_decoder = HifiganGenerator( self.args.hidden_channels, 1, @@ -671,7 +673,7 @@ def forward( waveform, slice_ids * self.config.audio.hop_length, self.args.spec_segment_size * self.config.audio.hop_length, - pad_short=True + pad_short=True, ) if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: @@ -693,7 +695,7 @@ def forward( outputs.update( { "model_outputs": o, - "alignments" : attn.squeeze(1), + "alignments": attn.squeeze(1), "m_p": m_p, "logs_p": logs_p, "z": z, @@ -949,7 +951,7 @@ def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - figures, audios = self._log(self.ap, batch, outputs, "eval") + figures, audios = self._log(self.ap, batch, outputs, "eval") logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 3251337768..c2e7f56146 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -68,7 +68,7 @@ def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_ """ # pad the input tensor if it is shorter than the segment size if pad_short and x.shape[-1] < segment_size: - x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) + x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) segments = torch.zeros_like(x[:, :, :segment_size]) @@ -78,12 +78,14 @@ def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_ x_i = x[i] if pad_short and index_end > x.size(2): # pad the sample if it is shorter than the segment size - x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) + x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) segments[i] = x_i[:, index_start:index_end] return segments -def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False): +def rand_segments( + x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False +): """Create random segments based on the input lengths. Args: @@ -110,7 +112,9 @@ def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size= _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size + 1 else: - assert all(len_diff > 0), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all( + len_diff > 0 + ), f" [!] At least one sample is shorter than the segment size ({segment_size}). 
diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py
index 8f14d71735..6c1f63f087 100644
--- a/TTS/tts/utils/languages.py
+++ b/TTS/tts/utils/languages.py
@@ -1,7 +1,6 @@
 import json
 import os
 from typing import Dict, List
-from TTS.config import check_config_and_model_args
 
 import fsspec
 import numpy as np
@@ -9,6 +8,8 @@
 from coqpit import Coqpit
 from torch.utils.data.sampler import WeightedRandomSampler
 
+from TTS.config import check_config_and_model_args
+
 
 class LanguageManager:
     """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py
index 4fd1f19cb8..78c1298109 100644
--- a/TTS/tts/utils/visual.py
+++ b/TTS/tts/utils/visual.py
@@ -104,7 +104,7 @@ def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
 
     fig, ax = plt.subplots()
     x = np.array(range(len(chars)))
-    my_xticks = [c for c in chars]
+    my_xticks = chars
     plt.xticks(x, my_xticks)
     ax.set_xlabel("characters")
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index ddc2a6a545..6821e975b0 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -5,10 +5,8 @@
 import pysbd
 import torch
 
-from TTS.config import check_config_and_model_args, get_from_config_or_model_args_with_default, load_config
+from TTS.config import load_config
 from TTS.tts.models import setup_model as setup_tts_model
-from TTS.tts.utils.languages import LanguageManager
-from TTS.tts.utils.speakers import SpeakerManager
 
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py
index 7e03e94f2e..6978f0e798 100644
--- a/TTS/vocoder/models/gan.py
+++ b/TTS/vocoder/models/gan.py
@@ -19,7 +19,7 @@
 
 
 class GAN(BaseVocoder):
-    def __init__(self, config: Coqpit, ap: AudioProcessor=None):
+    def __init__(self, config: Coqpit, ap: AudioProcessor = None):
         """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer.
         It also helps mixing and matching different generator and disciminator networks easily.
@@ -306,7 +306,7 @@ def format_batch(batch: List) -> Dict:
             x, y = batch
         return {"input": x, "waveform": y}
 
-    def get_data_loader(  # pylint: disable=no-self-use
+    def get_data_loader(  # pylint: disable=no-self-use, unused-argument
        self,
        config: Coqpit,
        assets: Dict,
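The `GAN.__init__` docstring above describes the wrapper pattern: one module owns both networks so the trainer can drive alternating updates through a single interface. A rough sketch of that pattern (hypothetical names, not the Coqui implementation):

    import torch
    from torch import nn

    class GANWrapper(nn.Module):
        """Bundle a generator and a discriminator behind one trainer-facing module."""

        def __init__(self, generator: nn.Module, discriminator: nn.Module):
            super().__init__()
            self.model_g = generator
            self.model_d = discriminator

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # inference only touches the generator
            return self.model_g(x)

        def train_step(self, batch: dict, optimizer_idx: int):
            fake = self.model_g(batch["input"])
            if optimizer_idx == 0:
                # discriminator turn: score real audio and detached fake audio
                return self.model_d(batch["waveform"]), self.model_d(fake.detach())
            # generator turn: gradients flow back through both networks
            return self.model_d(fake)

Because either sub-network can be swapped independently, the wrapper is what lets different generators and discriminators be mixed and matched, as the docstring says.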
diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index 477ee71fb0..75245ab8b7 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -52,7 +52,7 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
             max_text_len=c.max_text_len,
             min_audio_len=c.min_audio_len,
             max_audio_len=c.max_audio_len,
-            start_by_longest=start_by_longest
+            start_by_longest=start_by_longest,
         )
         dataloader = DataLoader(
             dataset,
diff --git a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py
index 708ecbf50e..23bb440a0a 100644
--- a/tests/tts_tests/test_helpers.py
+++ b/tests/tts_tests/test_helpers.py
@@ -1,6 +1,6 @@
 import torch as T
 
-from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask, rand_segments
+from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask
 
 
 def average_over_durations_test():  # pylint: disable=no-self-use
@@ -57,12 +57,12 @@ def rand_segments_test():
     assert segments.shape == (2, 3, 3)
     assert all(seg_idxs >= 0), seg_idxs
     try:
-        segments, _ = rand_segments(x, x_lens, segment_size=5) 
+        segments, _ = rand_segments(x, x_lens, segment_size=5)
         raise Exception("Should have failed")
     except:
         pass
     x_lens_back = x_lens.clone()
-    segments, seg_idxs= rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
+    segments, seg_idxs = rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
     assert segments.shape == (2, 3, 5)
     assert all(seg_idxs >= 0), seg_idxs
     assert all(x_lens_back == x_lens)
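Aside on `rand_segments_test` above: the expected failure is still caught with a bare `except`. A tighter variant that pins the exception type, sketched under the assumption that the helper keeps signalling the error with `assert`:

    import torch as T

    from TTS.tts.utils.helpers import rand_segments

    x = T.rand(2, 3, 4)
    x_lens = T.full((2,), 3, dtype=T.long)  # every sample shorter than segment_size=5
    try:
        rand_segments(x, x_lens, segment_size=5)
        raise RuntimeError("Should have failed")
    except AssertionError:
        pass  # expected: at least one sample is shorter than the segment size

Catching `AssertionError` specifically keeps the test from silently swallowing unrelated bugs the way a bare `except` can.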
\"odysseus/metadata.csv\",\n", - " \"undine/metadata.csv\",\n", - " \"reise_tilsit/metadata.csv\",\n", - " \"schmied_seines_glueckes/metadata.csv\",\n", - " \"kammmacher/metadata.csv\",\n", - " \"unterm_birnbaum/metadata.csv\",\n", - " \"liebesbriefe/metadata.csv\",\n", - " \"sandmann/metadata.csv\"]\n", - "NUM_PROC = 8" + "NUM_PROC = 8\n", + "DATASET_CONFIG = BaseDatasetConfig(\n", + " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/home/ubuntu/TTS/depot/data/male_dataset1_44k/\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n", + " txt_file = os.path.join(root_path, meta_file)\n", + " items = []\n", + " speaker_name = \"maledataset1\"\n", + " with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n", + " for line in ttf:\n", + " cols = line.split(\"|\")\n", + " wav_file = os.path.join(root_path, \"wavs\", cols[0])\n", + " text = cols[1]\n", + " items.append([text, wav_file, speaker_name])\n", + " return items" ] }, { @@ -69,8 +77,10 @@ "outputs": [], "source": [ "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n", - "items = mailabs(DATA_PATH, META_DATA)\n", - "print(\" > Number of audio files: {}\".format(len(items)))" + "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n", + "items = train_samples + eval_samples\n", + "print(\" > Number of audio files: {}\".format(len(items)))\n", + "print(items[1])" ] }, { @@ -103,6 +113,15 @@ "print([item for item, count in c.items() if count > 1])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "item" + ] + }, { "cell_type": "code", "execution_count": null, @@ -112,11 +131,9 @@ "outputs": [], "source": [ "def load_item(item):\n", - " file_name = item[1].strip()\n", " text = item[0].strip()\n", - " audio = librosa.load(file_name, sr=None)\n", - " sr = audio[1]\n", - " audio = audio[0]\n", + " file_name = item[1].strip()\n", + " audio, sr = librosa.load(file_name, sr=None)\n", " audio_len = len(audio) / sr\n", " text_len = len(text)\n", " return file_name, text, text_len, audio, audio_len\n", @@ -374,11 +391,18 @@ "# fequency bar plot - it takes time!!\n", "w_count_df.plot.bar()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -392,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.1" } }, "nbformat": 4, From f91220690b7fb7fcaacf900eaec95cda1309c25f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:20:07 +0100 Subject: [PATCH 63/67] Load right char class dynamically --- TTS/tts/utils/text/tokenizer.py | 21 ++++++++++++++++----- TTS/utils/generic_utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 80be368d48..bdaf8ea64b 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -3,6 +3,7 @@ from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, 
From f91220690b7fb7fcaacf900eaec95cda1309c25f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 28 Jan 2022 10:20:07 +0100
Subject: [PATCH 63/67] Load right char class dynamically

---
 TTS/tts/utils/text/tokenizer.py | 21 ++++++++++++++++-----
 TTS/utils/generic_utils.py      | 27 +++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 80be368d48..bdaf8ea64b 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -3,6 +3,7 @@
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
 from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
+from TTS.utils.generic_utils import get_import_path, import_class
 
 
 class TTSTokenizer:
@@ -152,15 +153,25 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
 
         # init characters
         if characters is None:
-            if config.use_phonemes:
-                # init phoneme set
-                characters, new_config = IPAPhonemes().init_from_config(config)
+            # set characters based on defined characters class
+            if config.characters and config.characters.characters_class:
+                CharactersClass = import_class(config.characters.characters_class)
+                characters, new_config = CharactersClass.init_from_config(config)
+            # set characters based on config
             else:
-                # init character set
-                characters, new_config = Graphemes().init_from_config(config)
+                if config.use_phonemes:
+                    # init phoneme set
+                    characters, new_config = IPAPhonemes().init_from_config(config)
+                else:
+                    # init character set
+                    characters, new_config = Graphemes().init_from_config(config)
+
         else:
             characters, new_config = characters.init_from_config(config)
 
+        # set characters class
+        new_config.characters.characters_class = get_import_path(characters)
+
         # init phonemizer
         phonemizer = None
         if config.use_phonemes:
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
index 6504cca622..69609bcbf3 100644
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -95,6 +95,33 @@ def find_module(module_path: str, module_name: str) -> object:
     return getattr(module, class_name)
 
 
+def import_class(module_path: str) -> object:
+    """Import a class from a module path.
+
+    Args:
+        module_path (str): The module path of the class.
+
+    Returns:
+        object: The imported class.
+    """
+    class_name = module_path.split(".")[-1]
+    module_path = ".".join(module_path.split(".")[:-1])
+    module = importlib.import_module(module_path)
+    return getattr(module, class_name)
+
+
+def get_import_path(obj: object) -> str:
+    """Get the import path of a class.
+
+    Args:
+        obj (object): The class object.
+
+    Returns:
+        str: The import path of the class.
+    """
+    return ".".join([type(obj).__module__, type(obj).__name__])
+
+
 def get_user_data_dir(appname):
     if sys.platform == "win32":
         import winreg  # pylint: disable=import-outside-toplevel
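The two helpers added above are inverses, which is what lets the tokenizer resolve `characters_class` from a config and then write the chosen class back into it. A quick round-trip sketch (the class path comes from the imports in this patch; default construction is assumed to work):

    from TTS.utils.generic_utils import get_import_path, import_class

    # dotted path -> class object
    CharactersClass = import_class("TTS.tts.utils.text.characters.Graphemes")
    characters = CharactersClass()  # assumes Graphemes initializes with its default character set

    # instance -> dotted path, e.g. for serializing the choice into the config
    assert get_import_path(characters) == "TTS.tts.utils.text.characters.Graphemes"

Storing the dotted path rather than the class itself keeps configs JSON-serializable while still letting users plug in their own character sets.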
state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} - self.load_state_dict(state["model"]) + # handle fine-tuning from a checkpoint with additional speakers + if state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape: + print(" > Loading checkpoint with additional speakers.") + emb_g = state["model"]["emb_g.weight"] + new_row = torch.zeros(1, emb_g.shape[1]) + emb_g = torch.cat([emb_g, new_row], axis=0) + state["model"]["emb_g.weight"] = emb_g + + self.load_state_dict(state["model"], strict=False) if eval: self.eval() assert not self.training From a164485ab709e48999e2871561d793849133a884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:23:22 +0100 Subject: [PATCH 65/67] Fix up --- TTS/tts/configs/shared_configs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 09266ce2e7..3c450cea64 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -53,6 +53,10 @@ class CharactersConfig(Coqpit): """Defines arguments for the `BaseCharacters` and its subclasses. Args: + characters_class (str): + Defines the class of the characters used. If None, we pick ```Phonemes``` or ```Graphemes``` based on + the configuration. Defaults to None. + pad (str): characters in place of empty padding. Defaults to None. @@ -84,6 +88,7 @@ class CharactersConfig(Coqpit): Sort the characters in alphabetical order. Defaults to True. """ + characters_class: str = None pad: str = None eos: str = None bos: str = None From 6c55245b2854bf5cfbceb84c5cb75ed9a9e517ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:23:52 +0100 Subject: [PATCH 66/67] Fix VCTK VITS recipe --- recipes/vctk/vits/train_vits.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 2906557dde..caf1caa100 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -57,9 +57,7 @@ print_step=25, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=32 * 256 * 4, - max_seq_len=1500000, + max_text_len= 325, # change this if you have a larger VRAM than 16GB output_path=output_path, datasets=[dataset_config], ) From a68fb7667757020068ea189a837d0f40979c84ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 13:50:58 +0100 Subject: [PATCH 67/67] Set `drop_last` --- TTS/tts/models/base_tts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 7cdfa915df..0eb2b5f311 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -324,9 +324,9 @@ def get_data_loader( loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, - shuffle=False, + shuffle=False, # shuffle is done in the dataset. collate_fn=dataset.collate_fn, - drop_last=False, + drop_last=True, # setting this False might cause issues in AMP training. sampler=sampler, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False,