From c971e065db80d4f9561867ab1d9924e3d9746e15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:29:57 +0100
Subject: [PATCH 01/67] Refactor Synthesizer class for TTSTokenizer

---
 TTS/utils/synthesizer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2e4f4735bc..a06a493fe9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -114,7 +114,8 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
         self.tts_config = load_config(tts_config_path)
         self.use_phonemes = self.tts_config.use_phonemes
-        self.tts_model = setup_tts_model(config=self.tts_config)
+        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
+        self.tokenizer = TTSTokenizer.init_from_config(self.tts_config)
 
         speaker_manager = self._init_speaker_manager()
         language_manager = self._init_language_manager()
@@ -332,6 +333,8 @@ def tts(
                 text=sen,
                 CONFIG=self.tts_config,
                 use_cuda=self.use_cuda,
+                ap=self.ap,
+                tokenizer=self.tokenizer,
                 speaker_id=speaker_id,
                 language_id=language_id,
                 language_name=language_name,
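A minimal usage sketch of the wiring this patch introduces (not part of the patch series; the config path and sample text are placeholders, and `TTSTokenizer.init_from_config` is assumed to return only the tokenizer at this point in the series):

    from TTS.config import load_config
    from TTS.utils.audio import AudioProcessor
    from TTS.tts.utils.text.tokenizer import TTSTokenizer

    # Build the audio processor and the tokenizer once, from the same model config.
    config = load_config("tts_config.json")  # placeholder path
    ap = AudioProcessor(verbose=False, **config.audio)
    tokenizer = TTSTokenizer.init_from_config(config)

    # The tokenizer owns text cleaning, phonemization and character-to-ID mapping,
    # so `synthesis()` no longer reaches into global symbol tables.
    token_ids = tokenizer.text_to_ids("Hello world.")
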
From c9142eb47f407bda1e8a4ad8bed83f544fd911ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:33:21 +0100
Subject: [PATCH 02/67] Refactor TTSDataset to use TTSTokenizer

---
 TTS/tts/datasets/dataset.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index bd7022e35b..e71cdb67c0 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -69,6 +69,9 @@ def __init__(
 
             samples (list): List of dataset instances.
 
+            tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
+                use the given. Defaults to None.
+
             tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
                 use the given. Defaults to None.
 
@@ -202,6 +205,20 @@ def get_token_ids(self, idx, text):
             token_ids = self.tokenizer.text_to_ids(text)
         return np.array(token_ids, dtype=np.int32)
 
+    @staticmethod
+    def _parse_sample(item):
+        language_name = None
+        attn_file = None
+        if len(item) == 5:
+            text, wav_file, speaker_name, language_name, attn_file = item
+        elif len(item) == 4:
+            text, wav_file, speaker_name, language_name = item
+        elif len(item) == 3:
+            text, wav_file, speaker_name = item
+        else:
+            raise ValueError(" [!] Dataset cannot parse the sample.")
+        return text, wav_file, speaker_name, language_name, attn_file
+
     def load_data(self, idx):
         item = self.samples[idx]

From da13f46a04f329d70dd73725cae9185f80780026 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:34:45 +0100
Subject: [PATCH 03/67] Refactor synthesis.py for TTSTokenizer

---
 TTS/tts/utils/synthesis.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index c2efdcba70..10eb55a65c 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -175,11 +175,8 @@ def synthesis(
     text,
     CONFIG,
     use_cuda,
-<<<<<<< HEAD
-=======
     ap,
     tokenizer,
->>>>>>> Refactor synthesis.py for TTSTokenizer
     speaker_id=None,
     style_wav=None,
     use_griffin_lim=False,

From 2588e8290a32dfbb320bff4907f4c41eece91cb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Nov 2021 13:36:35 +0100
Subject: [PATCH 04/67] Refactor GlowTTS model and recipe for TTSTokenizer

---
 TTS/tts/models/base_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 272317905b..64086a847d 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -287,7 +287,7 @@ def get_data_loader(
                 verbose=verbose,
                 speaker_id_mapping=speaker_id_mapping,
                 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
-                tokenizer=self.tokenizer,
+                tokenizer=self.tokenizer
             )

             # wait all the DDP process to be ready

From e1db18045c4b593ada230b097cdb15421c7c2c60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 17 Nov 2021 12:46:04 +0100
Subject: [PATCH 05/67] Update imports for symbols -> characters

---
 TTS/tts/models/base_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 64086a847d..272317905b 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -287,7 +287,7 @@ def get_data_loader(
                 verbose=verbose,
                 speaker_id_mapping=speaker_id_mapping,
                 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
-                tokenizer=self.tokenizer
+                tokenizer=self.tokenizer,
             )

             # wait all the DDP process to be ready

From 66cad5b5b4f62fd4696498b93719f3336fb61754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 24 Nov 2021 17:49:20 +0100
Subject: [PATCH 06/67] Update for tokenizer API

---
 TTS/utils/synthesizer.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index a06a493fe9..2e4f4735bc 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -114,8 +114,7 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
         self.tts_config = load_config(tts_config_path)
         self.use_phonemes = self.tts_config.use_phonemes
-        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
-        self.tokenizer = TTSTokenizer.init_from_config(self.tts_config)
+        self.tts_model = setup_tts_model(config=self.tts_config)
 
         speaker_manager = self._init_speaker_manager()
         language_manager = self._init_language_manager()
@@ -333,8 +332,6 @@ def tts(
                 text=sen,
                 CONFIG=self.tts_config,
                 use_cuda=self.use_cuda,
-                ap=self.ap,
-                tokenizer=self.tokenizer,
                 speaker_id=speaker_id,
                 language_id=language_id,
                 language_name=language_name,

From 580b99e43c03307c8d9ec8a825f2c165710bb314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 30 Nov 2021 15:50:18 +0100
Subject: [PATCH 07/67] =?UTF-8?q?Refactor=20TTSDataset=20=E2=9A=A1?=
 =?UTF-8?q?=EF=B8=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TTS/tts/datasets/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index e71cdb67c0..0cf910467f 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -216,8 +216,8 @@ def _parse_sample(item):
         elif len(item) == 3:
             text, wav_file, speaker_name = item
         else:
-            raise ValueError(" [!] Dataset cannot parse the sample.")
-        return text, wav_file, speaker_name, language_name, attn_file
+            token_ids = self.tokenizer.text_to_ids(text)
+        return token_ids
 
     def load_data(self, idx):
         item = self.samples[idx]

From 580b99e43c03307c8d9ec8a825f2c165710bb314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 30 Nov 2021 15:55:36 +0100
Subject: [PATCH 08/67] Refactoring VITS for the tokenizer API

---
 TTS/tts/models/vits.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index d7059da905..aa578ff8cd 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -272,10 +272,7 @@ class Vits(BaseTTS):
 
     # pylint: disable=dangerous-default-value
     def __init__(
-        self,
-        config: Coqpit,
-        speaker_manager: SpeakerManager = None,
-        language_manager: LanguageManager = None,
+        self, config: Coqpit, ap: "AudioProcessor", tokenizer: "TTSTokenizer", speaker_manager: SpeakerManager = None, language_manager: LanguageManager = None
     ):
 
         super().__init__(config, ap, tokenizer, speaker_manager)

From 7c46d5ec83372074d8dd5a8540ef83da055a551c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 1 Dec 2021 10:06:02 +0100
Subject: [PATCH 09/67] Update data loader tests

---
 TTS/tts/datasets/dataset.py     | 17 -----------------
 tests/data_tests/test_loader.py |  1 -
 2 files changed, 18 deletions(-)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 0cf910467f..bd7022e35b 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -69,9 +69,6 @@ def __init__(
 
             samples (list): List of dataset instances.
 
-            tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
-                use the given. Defaults to None.
-
             tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
                 use the given. Defaults to None.
@@ -205,20 +202,6 @@ def get_token_ids(self, idx, text):
             token_ids = self.tokenizer.text_to_ids(text)
         return np.array(token_ids, dtype=np.int32)
 
-    @staticmethod
-    def _parse_sample(item):
-        language_name = None
-        attn_file = None
-        if len(item) == 5:
-            text, wav_file, speaker_name, language_name, attn_file = item
-        elif len(item) == 4:
-            text, wav_file, speaker_name, language_name = item
-        elif len(item) == 3:
-            text, wav_file, speaker_name = item
-        else:
-            token_ids = self.tokenizer.text_to_ids(text)
-            return token_ids
-
     def load_data(self, idx):
         item = self.samples[idx]

diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index a1d43b8176..ac850a1440 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -39,7 +39,6 @@ def __init__(self, *args, **kwargs):
 
     def _create_dataloader(self, batch_size, r, bgs):
         items = ljspeech(c.data_path, "metadata.csv")
-        tokenizer = TTSTokenizer.init_from_config(c)
 
         dataset = TTSDataset(
             outputs_per_step=r,

From 033dedffbc79d7f8961ce40f65a5913bca0d92bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 08:56:57 +0000
Subject: [PATCH 10/67] Add init_from_config

---
 TTS/vocoder/models/base_vocoder.py | 1 +
 TTS/vocoder/models/wavegrad.py     | 4 ++++
 TTS/vocoder/models/wavernn.py      | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/TTS/vocoder/models/base_vocoder.py b/TTS/vocoder/models/base_vocoder.py
index 9d6ef26f6f..2728525cc5 100644
--- a/TTS/vocoder/models/base_vocoder.py
+++ b/TTS/vocoder/models/base_vocoder.py
@@ -20,6 +20,7 @@ class BaseVocoder(BaseModel):
 
     def __init__(self, config):
         super().__init__(config)
+        self._set_model_args(config)
 
     def _set_model_args(self, config: Coqpit):
         """Setup model args based on the config type.
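A sketch of the construction pattern this patch adds to the vocoder models (the pattern itself is what the diffs below show; the `TTS.vocoder.configs` import path is an assumption):

    from TTS.vocoder.configs import WavegradConfig
    from TTS.vocoder.models.wavegrad import Wavegrad

    config = WavegradConfig()
    # `init_from_config` is a thin factory; for these models it is equivalent
    # to calling the constructor directly, but it gives every model a uniform,
    # config-driven entry point.
    model = Wavegrad.init_from_config(config)
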
diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py
index ed4f4b37b4..f801715df7 100644
--- a/TTS/vocoder/models/wavegrad.py
+++ b/TTS/vocoder/models/wavegrad.py
@@ -339,3 +339,7 @@ def on_epoch_start(self, trainer):  # pylint: disable=unused-argument
         noise_schedule = self.config["train_noise_schedule"]
         betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
         self.compute_noise_level(betas)
+
+    @staticmethod
+    def init_from_config(config: "WavegradConfig"):
+        return Wavegrad(config)

diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py
index 1977efb687..5ce01782ef 100644
--- a/TTS/vocoder/models/wavernn.py
+++ b/TTS/vocoder/models/wavernn.py
@@ -631,3 +631,7 @@ def get_data_loader(  # pylint: disable=no-self-use
 
     def get_criterion(self):
         # define train functions
         return WaveRNNLoss(self.args.mode)
+
+    @staticmethod
+    def init_from_config(config: "WavernnConfig"):
+        return Wavernn(config)
From 29ff0f6a376670073680fc7293496f9218873e4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:51:58 +0000
Subject: [PATCH 11/67] Make lint

---
 TTS/tts/datasets/__init__.py                  |   4 +-
 TTS/tts/datasets/dataset.py                   | 177 ++++++++--------
 TTS/tts/utils/synthesis.py                    |   4 +-
 TTS/tts/utils/text/characters.py              | 199 +++++++++++-------
 TTS/tts/utils/text/phonemizers/base.py        |  36 ++--
 .../utils/text/phonemizers/espeak_wrapper.py  |  65 ++++--
 .../utils/text/phonemizers/gruut_wrapper.py   |   3 +-
 .../text/phonemizers/ja_jp_phonemizer.py      |  20 +-
 .../text/phonemizers/multi_phonemizer.py      |  28 +--
 .../text/phonemizers/zh_cn_phonemizer.py      |  23 +-
 TTS/tts/utils/text/punctuation.py             |  18 +-
 TTS/utils/audio.py                            |   3 +-
 TTS/utils/synthesizer.py                      |   1 -
 TTS/vocoder/models/gan.py                     |   2 +-
 14 files changed, 333 insertions(+), 250 deletions(-)

diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py
index 40eed7e365..4e8a2485db 100644
--- a/TTS/tts/datasets/__init__.py
+++ b/TTS/tts/datasets/__init__.py
@@ -88,8 +88,8 @@ def load_tts_samples(
             meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
         # load attention masks for the duration predictor training
-        if dataset.meta_file_attn_mask:
-            meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
+        if d.meta_file_attn_mask:
+            meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"]))
             for idx, ins in enumerate(meta_data_train_all):
                 attn_file = meta_data[ins[1]].strip()
                 meta_data_train_all[idx].append(attn_file)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index bd7022e35b..229f59c7a0 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -1,7 +1,6 @@
 import collections
 import os
 import random
-from multiprocessing import Pool
 from typing import Dict, List, Union
 
 import numpy as np
@@ -10,7 +9,6 @@
 from torch.utils.data import Dataset
 
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
@@ -183,7 +181,7 @@ def load_wav(self, filename):
     def get_phonemes(self, idx, text):
         out_dict = self.phoneme_dataset[idx]
         assert text == out_dict["text"], f"{text} != {out_dict['text']}"
-        assert out_dict["token_ids"].size > 0
+        assert len(out_dict["token_ids"]) > 0
         return out_dict
 
     def get_f0(self, idx):
@@ -192,7 +190,8 @@ def get_f0(self, idx):
         assert wav_file == out_dict["audio_file"]
         return out_dict
 
-    def get_attn_maks(self, attn_file):
+    @staticmethod
+    def get_attn_mask(attn_file):
         return np.load(attn_file)
 
     def get_token_ids(self, idx, text):
@@ -205,7 +204,7 @@ def load_data(self, idx):
         item = self.samples[idx]
 
-        text, wav_file, speaker_name, language_name, attn_file = _parse_sample(item)
+        text, wav_file, speaker_name, _, attn_file = _parse_sample(item)
         raw_text = text
 
         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
@@ -263,7 +262,7 @@ def filter_by_length(lengths: List[int], min_len: int, max_len: int):
         idxs = np.argsort(lengths)  # ascending order
         ignore_idx = []
         keep_idx = []
-        for i, idx in enumerate(idxs):
+        for idx in idxs:
             length = lengths[idx]
             if length < min_len or length > max_len:
                 ignore_idx.append(idx)
@@ -278,6 +277,7 @@ def sort_by_length(lengths: List[int]):
 
     @staticmethod
     def create_buckets(samples, batch_group_size: int):
+        assert batch_group_size > 0
         for i in range(len(samples) // batch_group_size):
             offset = i * batch_group_size
             end_offset = offset + batch_group_size
@@ -320,7 +320,8 @@ def preprocess_samples(self):
         # shuffle batch groups
         # create batches with similar length items
         # the larger the `batch_group_size`, the higher the length variety in a batch.
-        samples = self.create_buckets(samples, self.batch_group_size)
+        if self.batch_group_size > 0:
+            samples = self.create_buckets(samples, self.batch_group_size)
         # update items to the new sorted items
         self.samples = samples
@@ -572,6 +573,7 @@ def precompute(self, num_workers=1):
 
         We use pytorch dataloader because we are lazy.
         """
+        print("[*] Pre-computing phonemes...")
        with tqdm.tqdm(total=len(self)) as pbar:
             batch_size = num_workers if num_workers > 0 else 1
             dataloder = torch.utils.data.DataLoader(
@@ -659,16 +661,21 @@ def __len__(self):
         return len(self.samples)
 
     def precompute(self, num_workers=0):
+        print("[*] Pre-computing F0s...")
         with tqdm.tqdm(total=len(self)) as pbar:
             batch_size = num_workers if num_workers > 0 else 1
+            # we do not normalize at preprocessing
+            normalize_f0 = self.normalize_f0
+            self.normalize_f0 = False
             dataloder = torch.utils.data.DataLoader(
                 batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
             )
             computed_data = []
             for batch in dataloder:
                 f0 = batch["f0"]
-                computed_data.append([f for f in f0])
+                computed_data.append(f for f in f0)
                 pbar.update(batch_size)
+            self.normalize_f0 = normalize_f0
 
         if self.normalize_f0:
             computed_data = [tensor for batch in computed_data for tensor in batch]  # flatten
@@ -747,80 +754,80 @@ def print_logs(self, level: int = 0) -> None:
         print(f"{indent}| > Number of instances : {len(self.samples)}")
 
-if __name__ == "__main__":
-    from torch.utils.data import DataLoader
-
-    from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
-    from TTS.tts.datasets import load_tts_samples
-    from TTS.tts.utils.text.characters import IPAPhonemes
-    from TTS.tts.utils.text.phonemizers import ESpeak
-
-    dataset_config = BaseDatasetConfig(
-        name="ljspeech",
-        meta_file_train="metadata.csv",
-        path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
-    )
-    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
-    samples = train_samples + eval_samples
-
-    phonemizer = ESpeak(language="en-us")
-    tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
-    # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
-    # ph_dataset.precompute(num_workers=4)
-
-    # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     break
-
-    audio_config = BaseAudioConfig(
-        sample_rate=22050,
-        win_length=1024,
-        hop_length=256,
-        num_mels=80,
-        preemphasis=0.0,
-        ref_level_db=20,
-        log_func="np.log",
-        do_trim_silence=True,
-        trim_db=45,
-        mel_fmin=0,
-        mel_fmax=8000,
-        spec_gain=1.0,
-        signal_norm=False,
-        do_amp_to_db_linear=False,
-    )
-
-    ap = AudioProcessor.init_from_config(audio_config)
-
-    # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
-
-    # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     breakpoint()
-    #     break
-
-    dataset = TTSDataset(
-        outputs_per_step=1,
-        compute_linear_spec=False,
-        samples=samples,
-        ap=ap,
-        return_wav=False,
-        batch_group_size=0,
-        min_seq_len=0,
-        max_seq_len=500,
-        use_noise_augment=False,
-        verbose=True,
-        speaker_id_mapping=None,
-        d_vector_mapping=None,
-        compute_f0=True,
-        f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
-        tokenizer=tokenizer,
-        phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
-        precompute_num_workers=4,
-    )
-
-    dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
-    for batch in dataloader:
-        print(batch)
-        break
+# if __name__ == "__main__":
+#     from torch.utils.data import DataLoader
+
+#     from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
+#     from TTS.tts.datasets import load_tts_samples
+#     from TTS.tts.utils.text.characters import IPAPhonemes
+#     from TTS.tts.utils.text.phonemizers import ESpeak
+
+#     dataset_config = BaseDatasetConfig(
+#         name="ljspeech",
+#         meta_file_train="metadata.csv",
+#         path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
+#     )
+#     train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+#     samples = train_samples + eval_samples

+#     phonemizer = ESpeak(language="en-us")
+#     tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
+#     # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
+#     # ph_dataset.precompute(num_workers=4)

+#     # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     break

+#     audio_config = BaseAudioConfig(
+#         sample_rate=22050,
+#         win_length=1024,
+#         hop_length=256,
+#         num_mels=80,
+#         preemphasis=0.0,
+#         ref_level_db=20,
+#         log_func="np.log",
+#         do_trim_silence=True,
+#         trim_db=45,
+#         mel_fmin=0,
+#         mel_fmax=8000,
+#         spec_gain=1.0,
+#         signal_norm=False,
+#         do_amp_to_db_linear=False,
+#     )

+#     ap = AudioProcessor.init_from_config(audio_config)

+#     # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)

+#     # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     breakpoint()
+#     #     break

+#     dataset = TTSDataset(
+#         outputs_per_step=1,
+#         compute_linear_spec=False,
+#         samples=samples,
+#         ap=ap,
+#         return_wav=False,
+#         batch_group_size=0,
+#         min_seq_len=0,
+#         max_seq_len=500,
+#         use_noise_augment=False,
+#         verbose=True,
+#         speaker_id_mapping=None,
+#         d_vector_mapping=None,
+#         compute_f0=True,
+#         f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
+#         tokenizer=tokenizer,
+#         phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
+#         precompute_num_workers=4,
+#     )

+#     dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
+#     for batch in dataloader:
+#         print(batch)
+#         break
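For reference, a sketch of how the phoneme pre-computation shown in this file is meant to be driven (names are from this diff; the cache path is a placeholder):

    # Fill the phoneme cache once, before training starts. `precompute` runs a
    # throwaway DataLoader pass so that workers can phonemize in parallel.
    ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="phoneme_cache")
    ph_dataset.precompute(num_workers=4)

    # Subsequent `ph_dataset[idx]` lookups then read token IDs from the cache.
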
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 10eb55a65c..47ea0e934c 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -283,10 +283,10 @@ def synthesis(
         wav = model_outputs.squeeze(0)
     else:
         if use_griffin_lim:
-            wav = inv_spectrogram(model_outputs, ap, CONFIG)
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
             # trim silence
             if do_trim_silence:
-                wav = trim_silence(wav, ap)
+                wav = trim_silence(wav, model.ap)
     return_dict = {
         "wav": wav,
         "alignments": alignments,

diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py
index 24ce51f1a8..aae6844ffa 100644
--- a/TTS/tts/utils/text/characters.py
+++ b/TTS/tts/utils/text/characters.py
@@ -1,3 +1,8 @@
+from dataclasses import replace
+
+from TTS.tts.configs.shared_configs import CharactersConfig
+
+
 def parse_symbols():
     return {
         "pad": _pad,
@@ -29,46 +34,49 @@
 _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
 
 
-def create_graphemes(
-    characters=_characters,
-    punctuations=_punctuations,
-    pad=_pad,
-    eos=_eos,
-    bos=_bos,
-    blank=_blank,
-    unique=True,
-):  # pylint: disable=redefined-outer-name
-    """Function to create default characters and phonemes"""
-    # create graphemes
-    _graphemes = list(characters)
-    _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
-    _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
-    _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
-    _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
-    _graphemes = _graphemes + list(punctuations)
-    return _graphemes, _phonemes
-
-
-def create_phonemes(
-    phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
-):
-    # create phonemes
-    _phonemes = None
-    _phonemes_sorted = (
-        sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
-    )  # this is to keep previous models compatible.
-    _phonemes = list(_phonemes_sorted)
-    _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
-    _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
-    _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
-    _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
-    _phonemes = _phonemes + list(punctuations)
-    _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
-    return _phonemes
-
-
-graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
-phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
+# def create_graphemes(
+#     characters=_characters,
+#     punctuations=_punctuations,
+#     pad=_pad,
+#     eos=_eos,
+#     bos=_bos,
+#     blank=_blank,
+#     unique=True,
+# ):  # pylint: disable=redefined-outer-name
+#     """Function to create default characters and phonemes"""
+#     # create graphemes
+#     _graphemes = list(characters)
+#     _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
+#     _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
+#     _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
+#     _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
+#     _graphemes = _graphemes + list(punctuations)
+#     return _graphemes, _phonemes
+
+
+# def create_phonemes(
+#     phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
+# ):
+#     # create phonemes
+#     _phonemes = None
+#     _phonemes_sorted = (
+#         sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
+#     )  # this is to keep previous models compatible.
+#     _phonemes = list(_phonemes_sorted)
+#     _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
+#     _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
+#     _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
+#     _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
+#     _phonemes = _phonemes + list(punctuations)
+#     _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
+#     return _phonemes
+
+
+# DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
+# DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
 
 
 class BaseCharacters:
@@ -114,7 +122,7 @@ def __init__(
         eos: str,
         bos: str,
         blank: str,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         self._characters = characters
@@ -202,14 +210,20 @@ def _create_vocab(self):
         _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
         self._vocab = _vocab + list(self._punctuations)
         self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
-        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
         if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
             assert (
                 len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
-            ), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}"
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
 
     def char_to_id(self, char: str) -> int:
-        return self._char_to_id[char]
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
 
     def id_to_char(self, idx: int) -> str:
         return self._id_to_char[idx]
@@ -229,9 +243,23 @@ def print_log(self, level: int = 0):
         print(f"{indent}| > Num chars: {self.num_chars}")
 
     @staticmethod
-    def init_from_config(config: "Coqpit"):
-        return BaseCharacters(
-            **config.characters if config.characters is not None else {},
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+
+        Implement this method for your subclass.
+        """
+        ...
+
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
         )
 
@@ -275,31 +303,42 @@ def __init__(
         eos: str = _eos,
         bos: str = _bos,
         blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
 
     @staticmethod
     def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
         # band-aid for compatibility with old models
         if "characters" in config and config.characters is not None:
             if "phonemes" in config.characters and config.characters.phonemes is not None:
                 config.characters["characters"] = config.characters["phonemes"]
-            return IPAPhonemes(
-                characters=config.characters["characters"],
-                punctuations=config.characters["punctuations"],
-                pad=config.characters["pad"],
-                eos=config.characters["eos"],
-                bos=config.characters["bos"],
-                blank=config.characters["blank"],
-                is_unique=config.characters["is_unique"],
-                is_sorted=config.characters["is_sorted"],
-            )
-        else:
-            return IPAPhonemes(
-                **config.characters if config.characters is not None else {},
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
             )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
 
 
 class Graphemes(BaseCharacters):
@@ -339,24 +378,42 @@ def __init__(
         eos: str = _eos,
         bos: str = _bos,
         blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
         is_sorted: bool = True,
     ) -> None:
         super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
 
     @staticmethod
     def init_from_config(config: "Coqpit"):
-        return Graphemes(
-            **config.characters if config.characters is not None else {},
-        )
+        """Init a Graphemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
 
 
 if __name__ == "__main__":
     gr = Graphemes()
     ph = IPAPhonemes()
-
-    print(gr.vocab)
-    print(ph.vocab)
-
-    print(gr.num_chars)
-    assert "a" == gr.id_to_char(gr.char_to_id("a"))
+    gr.print_log()
+    ph.print_log()
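The practical consequence of the new `init_from_config` implementations above: they return a `(characters, config)` pair instead of just the character set, so callers must now unpack both (a sketch, using the classes from this diff):

    # The returned config may carry a freshly generated `characters` section.
    characters, config = Graphemes.init_from_config(config)
    print(characters.num_chars)
    assert "a" == characters.id_to_char(characters.char_to_id("a"))
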
diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py
index 249c8bce83..08fa8e130a 100644
--- a/TTS/tts/utils/text/phonemizers/base.py
+++ b/TTS/tts/utils/text/phonemizers/base.py
@@ -1,6 +1,5 @@
 import abc
-import itertools
-from typing import List, Tuple, Union
+from typing import List, Tuple
 
 from TTS.tts.utils.text.punctuation import Punctuation
 
@@ -8,6 +7,19 @@
 class BasePhonemizer(abc.ABC):
     """Base phonemizer class
 
+    Phonemization follows the following steps:
+        1. Preprocessing:
+            - remove empty lines
+            - remove punctuation
+            - keep track of punctuation marks
+
+        2. Phonemization:
+            - convert text to phonemes
+
+        3. Postprocessing:
+            - join phonemes
+            - restore punctuation marks
+
     Args:
         language (str): Language used by the phonemizer.
 
@@ -51,40 +63,30 @@ def language(self):
     @abc.abstractmethod
     def name():
         """The name of the backend"""
+        ...
 
     @classmethod
     @abc.abstractmethod
     def is_available(cls):
         """Returns True if the backend is installed, False otherwise"""
+        ...
 
     @classmethod
     @abc.abstractmethod
     def version(cls):
         """Return the backend version as a tuple (major, minor, patch)"""
+        ...
 
     @staticmethod
     @abc.abstractmethod
     def supported_languages():
         """Return a dict of language codes -> name supported by the backend"""
+        ...
 
     def is_supported_language(self, language):
         """Returns True if `language` is supported by the backend"""
         return language in self.supported_languages()
 
-    fr"""
-    Phonemization follows the following steps:
-        1. Preprocessing:
-            - remove empty lines
-            - remove punctuation
-            - keep track of punctuation marks
-
-        2. Phonemization:
-            - convert text to phonemes
-
-        3. Postprocessing:
-            - join phonemes
-            - restore punctuation marks
-    """
-
     @abc.abstractmethod
     def _phonemize(self, text, separator):
         """The main phonemization method"""
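A sketch of what a concrete subclass has to provide under this contract (the class `MyPhonemizer` and its toy rule are hypothetical; only the abstract methods shown are mandatory):

    class MyPhonemizer(BasePhonemizer):
        @staticmethod
        def name():
            return "my_phonemizer"

        @classmethod
        def is_available(cls):
            return True

        @classmethod
        def version(cls):
            return (1, 0, 0)

        @staticmethod
        def supported_languages():
            return {"en-us": "English (US)"}

        def _phonemize(self, text, separator):
            # toy character-level "phonemization"; a real backend goes here
            return separator.join(text.lower())

    # Punctuation stripping/restoring is handled by the base class around _phonemize.
    print(MyPhonemizer("en-us").phonemize("Hello, world!"))
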
diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
index f1d0b6cd2b..3cccee41d6 100644
--- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
+++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
@@ -28,29 +28,30 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
         "1",  # UTF8 text encoding
     ]
     cmd.extend(args)
-    logging.debug("espeakng: executing %s" % repr(cmd))
-    p = subprocess.Popen(
+    logging.debug("espeakng: executing %s", repr(cmd))
+
+    with subprocess.Popen(
         cmd,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
-    )
-    res = iter(p.stdout.readline, b"")
-    if not sync:
+    ) as p:
+        res = iter(p.stdout.readline, b"")
+        if not sync:
+            p.stdout.close()
+            if p.stderr:
+                p.stderr.close()
+            if p.stdin:
+                p.stdin.close()
+            return res
+        res2 = []
+        for line in res:
+            res2.append(line)
         p.stdout.close()
         if p.stderr:
             p.stderr.close()
         if p.stdin:
             p.stdin.close()
-        return res
-    res2 = []
-    for line in res:
-        res2.append(line)
-    p.stdout.close()
-    if p.stderr:
-        p.stderr.close()
-    if p.stdin:
-        p.stdin.close()
-    p.wait()
+        p.wait()
     return res2
@@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer):
 
     def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
         if self._ESPEAK_LIB is None:
             raise Exception("Unknown backend: %s" % backend)
+
+        # band-aid for backwards compatibility
+        if language == "en":
+            language = "en-us"
+
         super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
+        if backend is not None:
+            self.backend = backend
+
+    @property
+    def backend(self):
+        return self._ESPEAK_LIB
+
+    @backend.setter
+    def backend(self, backend):
+        if backend not in ["espeak", "espeak-ng"]:
+            raise Exception("Unknown backend: %s" % backend)
+        self._ESPEAK_LIB = backend
 
     def auto_set_espeak_lib(self) -> None:
         if is_tool("espeak-ng"):
@@ -115,24 +133,25 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str:
         # espeak and espeak-ng parses `ipa` differently
         if tie:
             # use '͡' between phonemes
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                 args.append("--ipa=1")
             else:
                 args.append("--ipa=3")
         else:
             # split with '_'
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                 args.append("--ipa=3")
             else:
                 args.append("--ipa=1")
         if tie:
             args.append("--tie=%s" % tie)
+
         args.append('"' + text + '"')
         # compute phonemes
         phonemes = ""
         for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
-            logging.debug("line: %s" % repr(line))
-            phonemes += line.decode("utf8").strip()
+            logging.debug("line: %s", repr(line))
+            phonemes += line.decode("utf8").strip()[2:]  # skip two redundant characters
         return phonemes.replace("_", separator)
 
     def _phonemize(self, text, separator=None):
@@ -146,7 +165,7 @@ def supported_languages() -> Dict:
             Dict: Dictionary of language codes.
         """
         if _DEF_ESPEAK_LIB is None:
-            raise {}
+            return {}
         args = ["--voices"]
         langs = {}
         count = 0
@@ -157,7 +176,7 @@
                 lang_code = cols[1]
                 lang_name = cols[3]
                 langs[lang_code] = lang_name
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
             count += 1
         return langs
 
@@ -168,9 +187,9 @@ def version(self) -> str:
             str: Version of the used backend.
         """
         args = ["--version"]
-        for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
+        for line in _espeak_exe(self.backend, args, sync=True):
             version = line.decode("utf8").strip().split()[2]
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
             return version
 
     @classmethod
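With the new property, the backend can be switched after construction and is validated on assignment (a usage sketch; an installed `espeak-ng` binary is assumed):

    phonemizer = ESpeak(language="en")  # "en" is now silently mapped to "en-us"
    phonemizer.backend = "espeak-ng"    # anything other than espeak/espeak-ng raises
    print(phonemizer.backend)
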
""" args = ["--version"] - for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True): + for line in _espeak_exe(self.backend, args, sync=True): version = line.decode("utf8").strip().split()[2] - logging.debug("line: %s" % repr(line)) + logging.debug("line: %s", repr(line)) return version @classmethod diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index d0aa469e26..f3e9c9abd4 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from os import stat from typing import List import gruut @@ -55,7 +54,7 @@ def __init__( def name(): return "gruut" - def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: + def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument """Convert input text to phonemes. Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 4f93edeb6b..60b965f9d8 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer): language = "ja-jp" - def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): + def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod @@ -61,12 +61,12 @@ def is_available(self) -> bool: return True -if __name__ == "__main__": - text = "これは、電話をかけるための私の日本語の例のテキストです。" - e = JA_JP_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "これは、電話をかけるための私の日本語の例のテキストです。" +# e = JA_JP_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index e8b2ce347e..e36b0a2a1f 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -17,7 +17,7 @@ class MultiPhonemizer: lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER language = "multi-lingual" - def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: + def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @@ -40,16 +40,16 @@ def supported_languages(self) -> List: return list(self.lang_to_phonemizer_name.keys()) -if __name__ == "__main__": - texts = { - "tr": "Merhaba, bu Türkçe bit örnek!", - "en-us": "Hello, this is English example!", - "de": "Hallo, das ist ein Deutches Beipiel!", - "zh-cn": "这是中国的例子", - } - phonemes = {} - ph = MultiPhonemizer() - for lang, text in texts.items(): - phoneme = ph.phonemize(text, lang) - phonemes[lang] = phoneme - print(phonemes) +# if __name__ == "__main__": +# texts = { +# "tr": "Merhaba, bu Türkçe bit örnek!", +# "en-us": "Hello, this is English example!", +# 
"de": "Hallo, das ist ein Deutches Beipiel!", +# "zh-cn": "这是中国的例子", +# } +# phonemes = {} +# ph = MultiPhonemizer() +# for lang, text in texts.items(): +# phoneme = ph.phonemize(text, lang) +# phonemes[lang] = phoneme +# print(phonemes) diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index e1bd77c7d8..5a4a55911d 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer): language = "zh-cn" - def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): + def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) @staticmethod def name(): return "zh_cn_phonemizer" - def phonemize_zh_cn(self, text: str, separator: str = "|") -> str: + @staticmethod + def phonemize_zh_cn(text: str, separator: str = "|") -> str: ph = chinese_text_to_phonemes(text, separator) return ph @@ -50,12 +51,12 @@ def is_available(self) -> bool: return True -if __name__ == "__main__": - text = "这是,样本中文。" - e = ZH_CN_Phonemizer() - print(e.supported_languages()) - print(e.version()) - print(e.language) - print(e.name()) - print(e.is_available()) - print("`" + e.phonemize(text) + "`") +# if __name__ == "__main__": +# text = "这是,样本中文。" +# e = ZH_CN_Phonemizer() +# print(e.supported_languages()) +# print(e.version()) +# print(e.language) +# print(e.name()) +# print(e.is_available()) +# print("`" + e.phonemize(text) + "`") diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py index 414ac25366..09087d5fcf 100644 --- a/TTS/tts/utils/text/punctuation.py +++ b/TTS/tts/utils/text/punctuation.py @@ -130,7 +130,7 @@ def restore(cls, text, puncs): return cls._restore(text, puncs, 0) @classmethod - def _restore(cls, text, puncs, num): + def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements """Auxiliary method for Punctuation.restore()""" if not puncs: return text @@ -159,14 +159,14 @@ def _restore(cls, text, puncs, num): return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) -if __name__ == "__main__": - punc = Punctuation() - text = "This is. This is, example!" +# if __name__ == "__main__": +# punc = Punctuation() +# text = "This is. This is, example!" 
diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py
index 9c6bf4541e..55ce49b508 100644
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -383,8 +383,7 @@ def __init__(
     def init_from_config(config: "Coqpit"):
         if "audio" in config:
             return AudioProcessor(**config.audio)
-        else:
-            return AudioProcessor(**config)
+        return AudioProcessor(**config)
 
     ### setting up the parameters ###
     def _build_mel_basis(

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2e4f4735bc..f6a1ae6ab1 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -13,7 +13,6 @@
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
 from TTS.tts.utils.synthesis import synthesis, trim_silence
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py
index e56d1db493..f78d69b86e 100644
--- a/TTS/vocoder/models/gan.py
+++ b/TTS/vocoder/models/gan.py
@@ -314,7 +314,7 @@ def get_data_loader(  # pylint: disable=no-self-use
         data_items: List,
         verbose: bool,
         num_gpus: int,
-        rank: int = 0,  # pylint: disable=unused-argument
+        rank: int = None,  # pylint: disable=unused-argument
     ):
         """Initiate and return the GAN dataloader.
From 49fef8db61f9fd59104b8887df92b481aef3698c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:52:45 +0000
Subject: [PATCH 12/67] Allow None pad and blank tokens

---
 TTS/tts/utils/text/tokenizer.py | 48 ++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 68a1c57548..3f416bbb86 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -57,8 +57,8 @@ def characters(self):
     @characters.setter
     def characters(self, new_characters):
         self._characters = new_characters
-        self.pad_id = self.characters.char_to_id(self.characters.pad)
-        self.blank_id = self.characters.char_to_id(self.characters.blank)
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
 
     def encode(self, text: str) -> List[int]:
         """Encodes a string of text as a sequence of IDs."""
@@ -82,7 +82,7 @@ def decode(self, token_ids: List[int]) -> str:
             text += self.characters.id_to_char(token_id)
         return text
 
-    def text_to_ids(self, text: str, language: str = None) -> List[int]:
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
         """Converts a string of text to a sequence of token IDs.
 
         Args:
@@ -137,32 +137,50 @@ def print_logs(self, level: int = 0):
             print(f"{indent}| > {char}")
 
     @staticmethod
-    def init_from_config(config: "Coqpit"):
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
         """Init Tokenizer object from config
 
         Args:
             config (Coqpit): Coqpit model config.
+            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
+                the config values. Defaults to None.
         """
         # init cleaners
         if isinstance(config.text_cleaner, (str, list)):
             text_cleaner = getattr(cleaners, config.text_cleaner)
 
+        # init characters
+        if characters is None:
+            if config.use_phonemes:
+                # init phoneme set
+                characters, new_config = IPAPhonemes().init_from_config(config)
+            else:
+                # init character set
+                characters, new_config = Graphemes().init_from_config(config)
+        else:
+            characters, new_config = characters.init_from_config(config)
+
+        # init phonemizer
         phonemizer = None
         if config.use_phonemes:
-            # init phoneme set
-            characters = IPAPhonemes().init_from_config(config)
             phonemizer_kwargs = {"language": config.phoneme_language}
-            # init phonemizer
             if "phonemizer" in config and config.phonemizer:
                 phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
             else:
-                phonemizer = get_phonemizer_by_name(
-                    DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
-                )
-        else:
-            # init character set
-            characters = Graphemes().init_from_config(config)
-        return TTSTokenizer(
-            config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
+                try:
+                    phonemizer = get_phonemizer_by_name(
+                        DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
+                    )
+                except KeyError as e:
+                    raise ValueError(
+                        f"""No phonemizer found for language {config.phoneme_language}.
+                        You may need to install a third party library for this language."""
+                    ) from e
+
+        return (
+            TTSTokenizer(
+                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
+            ),
+            new_config,
         )
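As with the character classes, `TTSTokenizer.init_from_config` now returns a `(tokenizer, config)` pair, so call sites change accordingly (sketch):

    tokenizer, config = TTSTokenizer.init_from_config(config)
    # `pad_id` / `blank_id` may now be None when the character set defines no
    # pad or blank token, so downstream code should guard against that.
    print(tokenizer.pad_id, tokenizer.blank_id)
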
""" speaker_manager = None - if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding is True: + if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding: + if samples: + speaker_manager = SpeakerManager(data_items=samples) if config.get("speaker_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=config.speaker_file) if config.get("speakers_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - if hasattr(config, "use_d_vector_file") and config.use_speaker_embedding is True: + if hasattr(config, "use_d_vector_file") and config.use_d_vector_file: if config.get("speakers_file", None): speaker_manager = SpeakerManager(d_vectors_file_path=config.speaker_file) if config.get("d_vector_file", None): From ae96243e24d4412261e33cb46ac92e48d484b4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:55:18 +0000 Subject: [PATCH 15/67] Update VITS for the new API --- TTS/tts/models/vits.py | 210 ++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index aa578ff8cd..957994f989 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1,7 +1,8 @@ import math -from dataclasses import dataclass, field +import random +from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import torch @@ -11,6 +12,7 @@ from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F +from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder @@ -20,6 +22,7 @@ from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, get_scheduler @@ -280,91 +283,78 @@ def __init__( self.END2END = True self.speaker_manager = speaker_manager self.language_manager = language_manager - if config.__class__.__name__ == "VitsConfig": - # loading from VitsConfig - self.num_chars = self.tokenizer.characters.num_chars - self.config = config - args = self.config.model_args - elif isinstance(config, VitsArgs): - # loading from VitsArgs - self.config = config - args = config - else: - raise ValueError("config must be either a VitsConfig or VitsArgs") self.args = args self.init_multispeaker(config) self.init_multilingual(config) - self.length_scale = args.length_scale - self.noise_scale = args.noise_scale - self.inference_noise_scale = args.inference_noise_scale - self.inference_noise_scale_dp = args.inference_noise_scale_dp - self.noise_scale_dp = args.noise_scale_dp - self.max_inference_len = args.max_inference_len - self.spec_segment_size = args.spec_segment_size + self.length_scale = self.args.length_scale + self.noise_scale = self.args.noise_scale + self.inference_noise_scale = self.args.inference_noise_scale + self.inference_noise_scale_dp = self.args.inference_noise_scale_dp + self.noise_scale_dp = 
From ae96243e24d4412261e33cb46ac92e48d484b4bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 7 Dec 2021 12:55:18 +0000
Subject: [PATCH 15/67] Update VITS for the new API

---
 TTS/tts/models/vits.py | 210 ++++++++++++++++++++---------------------
 1 file changed, 105 insertions(+), 105 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index aa578ff8cd..957994f989 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,7 +1,8 @@
 import math
-from dataclasses import dataclass, field
+import random
+from dataclasses import dataclass, field, replace
 from itertools import chain
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import torch
@@ -11,6 +12,7 @@
 from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
 
+from TTS.tts.configs.shared_configs import CharactersConfig
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
@@ -20,6 +22,7 @@
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.synthesis import synthesis
+from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment
 from TTS.utils.trainer_utils import get_optimizer, get_scheduler
@@ -280,91 +283,78 @@ def __init__(
         self.END2END = True
         self.speaker_manager = speaker_manager
         self.language_manager = language_manager
-        if config.__class__.__name__ == "VitsConfig":
-            # loading from VitsConfig
-            self.num_chars = self.tokenizer.characters.num_chars
-            self.config = config
-            args = self.config.model_args
-        elif isinstance(config, VitsArgs):
-            # loading from VitsArgs
-            self.config = config
-            args = config
-        else:
-            raise ValueError("config must be either a VitsConfig or VitsArgs")
         self.args = args
 
         self.init_multispeaker(config)
         self.init_multilingual(config)
 
-        self.length_scale = args.length_scale
-        self.noise_scale = args.noise_scale
-        self.inference_noise_scale = args.inference_noise_scale
-        self.inference_noise_scale_dp = args.inference_noise_scale_dp
-        self.noise_scale_dp = args.noise_scale_dp
-        self.max_inference_len = args.max_inference_len
-        self.spec_segment_size = args.spec_segment_size
+        self.length_scale = self.args.length_scale
+        self.noise_scale = self.args.noise_scale
+        self.inference_noise_scale = self.args.inference_noise_scale
+        self.inference_noise_scale_dp = self.args.inference_noise_scale_dp
+        self.noise_scale_dp = self.args.noise_scale_dp
+        self.max_inference_len = self.args.max_inference_len
+        self.spec_segment_size = self.args.spec_segment_size
""" - ap = assets["audio_processor"] - self._log(ap, batch, outputs, "train") + self._log(self.ap, batch, outputs, "train") @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - return self._log(ap, batch, outputs, "eval") + return self._log(self.ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap) -> Tuple[Dict, Dict]: + def test_run(self) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -985,36 +973,6 @@ def get_criterion(self): return [VitsGeneratorLoss(self.config), VitsDiscriminatorLoss(self.config)] - @staticmethod - def make_symbols(config): - """Create a custom arrangement of symbols used by the model. The output list of symbols propagate along the - whole training and inference steps.""" - _pad = config.characters["pad"] - _punctuations = config.characters["punctuations"] - _letters = config.characters["characters"] - _letters_ipa = config.characters["phonemes"] - symbols = [_pad] + list(_punctuations) + list(_letters) - if config.use_phonemes: - symbols += list(_letters_ipa) - return symbols - - @staticmethod - def get_characters(config: Coqpit): - if config.characters is not None: - symbols = Vits.make_symbols(config) - else: - from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel - parse_symbols, - phonemes, - symbols, - ) - - config.characters = parse_symbols() - if config.use_phonemes: - symbols = phonemes - num_chars = len(symbols) + getattr(config, "add_blank", False) - return symbols, config, num_chars - def load_checkpoint( self, config, checkpoint_path, eval=False ): # pylint: disable=unused-argument, redefined-builtin @@ -1030,23 +988,65 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "Coqpit"): - """Initialize model from config.""" + def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config - # init characters - if config.use_phonemes: - from TTS.tts.utils.text.characters import IPAPhonemes + Args: + config (VitsConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
+ """ + from TTS.utils.audio import AudioProcessor - characters = IPAPhonemes().init_from_config(config) - else: - from TTS.tts.utils.text.characters import Graphemes + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return Vits(new_config, ap, tokenizer, speaker_manager) - characters = Graphemes().init_from_config(config) - config.num_chars = characters.num_chars - from TTS.utils.audio import AudioProcessor +class VitsCharacters(BaseCharacters): + """Characters class for VITs model for compatibility with pre-trained models""" - ap = AudioProcessor.init_from_config(config) - tokenizer = TTSTokenizer.init_from_config(config) - speaker_manager = SpeakerManager.init_from_config(config) - return Vits(config, ap, tokenizer, speaker_manager) + def __init__( + self, + graphemes: str = _characters, + punctuations: str = _punctuations, + pad: str = _pad, + ipa_characters: str = _phonemes, + ) -> None: + if ipa_characters is not None: + graphemes += ipa_characters + super().__init__(graphemes, punctuations, pad, None, None, "", is_unique=False, is_sorted=True) + + def _create_vocab(self): + self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank] + self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} + # pylint: disable=unnecessary-comprehension + self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} + + @staticmethod + def init_from_config(config: Coqpit): + if config.characters is not None: + _pad = config.characters["pad"] + _punctuations = config.characters["punctuations"] + _letters = config.characters["characters"] + _letters_ipa = config.characters["phonemes"] + return ( + VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad), + config, + ) + characters = VitsCharacters() + new_config = replace(config, characters=characters.to_config()) + return characters, new_config + + def to_config(self) -> "CharactersConfig": + return CharactersConfig( + characters=self._characters, + punctuations=self._punctuations, + pad=self._pad, + eos=None, + bos=None, + blank=self._blank, + is_unique=False, + is_sorted=True, + ) From 160115bbdec9bb45eb970abef12e721e0c0b3c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:55:45 +0000 Subject: [PATCH 16/67] Update Tacotron models --- TTS/tts/models/base_tacotron.py | 22 +++++++++++++++-- TTS/tts/models/tacotron.py | 43 +++++++++++++++++++++++--------- TTS/tts/models/tacotron2.py | 44 +++++++++++++++++++++++---------- 3 files changed, 82 insertions(+), 27 deletions(-) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index ca8f3bb9ed..54939c61c1 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -9,6 +9,8 @@ from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import sequence_mask +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.generic_utils import format_aux_input from TTS.utils.io import load_fsspec from TTS.utils.training import gradual_training_scheduler @@ -17,8 +19,14 @@ class BaseTacotron(BaseTTS): """Base class shared by Tacotron and Tacotron2""" - def __init__(self, config: Coqpit): - super().__init__(config) + def __init__( + self, + config: "TacotronConfig", + ap: 
"AudioProcessor", + tokenizer: "TTSTokenizer", + speaker_manager: SpeakerManager = None, + ): + super().__init__(config, ap, tokenizer, speaker_manager) # pass all config fields as class attributes for key in config: @@ -107,6 +115,16 @@ def get_criterion(self) -> nn.Module: """Get the model criterion used in training.""" return TacotronLoss(self.config) + @staticmethod + def init_from_config(config: Coqpit): + """Initialize model from config.""" + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config) + return BaseTacotron(config, ap, tokenizer, speaker_manager) + ############################# # COMMON COMPUTE FUNCTIONS ############################# diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 4e46d252bf..8341f5bbd2 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,8 @@ # coding: utf-8 +from typing import Dict, List, Union + import torch -from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -10,6 +11,7 @@ from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -24,12 +26,15 @@ class Tacotron(BaseTacotron): a multi-speaker model. Defaults to None. """ - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): - super().__init__(config) + def __init__( + self, + config: "TacotronConfig", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - self.speaker_manager = speaker_manager - chars, self.config, _ = self.get_characters(config) - config.num_chars = self.num_chars = len(chars) + super().__init__(config, ap, tokenizer, speaker_manager) # pass all config fields to `self` # for fewer code change @@ -302,16 +307,30 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) + + @staticmethod + def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (TacotronConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
+ """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return Tacotron(new_config, ap, tokenizer, speaker_manager) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index ead3bf2b8e..d4e665e347 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,9 +1,8 @@ # coding: utf-8 -from typing import Dict +from typing import Dict, List, Union import torch -from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -12,6 +11,7 @@ from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -40,12 +40,16 @@ class Tacotron2(BaseTacotron): Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None. """ - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): - super().__init__(config) + def __init__( + self, + config: "Tacotron2Config", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): + + super().__init__(config, ap, tokenizer, speaker_manager) - self.speaker_manager = speaker_manager - chars, self.config, _ = self.get_characters(config) - config.num_chars = len(chars) self.decoder_output_dim = config.out_channels # pass all config fields to `self` @@ -325,16 +329,30 @@ def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use """Log training progress.""" - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) + + @staticmethod + def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (Tacotron2Config): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
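+
+        Example:
+            A minimal sketch, assuming a populated `Tacotron2Config`; the returned model
+            already carries its audio processor and tokenizer:
+
+                >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
+                >>> model = Tacotron2.init_from_config(Tacotron2Config())
+                >>> sample_rate = model.ap.sample_rate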
+ """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(new_config, samples) + return Tacotron2(new_config, ap, tokenizer, speaker_manager) From 0ff11d4a356e6e99a8deb3af35bec53adcd830fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:16 +0000 Subject: [PATCH 17/67] Update ForwardTTS --- TTS/tts/models/base_tts.py | 19 +++++++---------- TTS/tts/models/forward_tts.py | 40 ++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 272317905b..5986232231 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,6 +1,6 @@ import os import random -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -56,9 +56,10 @@ def _set_model_args(self, config: Coqpit): """ # don't use isintance not to import recursively if "Config" in config.__class__.__name__: - num_chars = ( - self.config.model_args.num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars + config_num_chars = ( + self.config.model_args.num_chars if hasattr(self.config, "model_args") else self.config.num_chars ) + num_chars = config_num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars if "characters" in config: self.config.num_chars = num_chars if hasattr(self.config, "model_args"): @@ -237,7 +238,7 @@ def get_data_loader( config: Coqpit, assets: Dict, is_eval: bool, - data_items: List, + samples: Union[List[Dict], List[List]], verbose: bool, num_gpus: int, rank: int = None, @@ -274,7 +275,7 @@ def get_data_loader( compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec, compute_f0=config.get("compute_f0", False), f0_cache_path=config.get("f0_cache_path", None), - meta_data=data_items, + samples=samples, ap=self.ap, return_wav=config.return_wav if "return_wav" in config else False, batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, @@ -283,6 +284,7 @@ def get_data_loader( min_audio_len=config.min_audio_len, max_audio_len=config.max_audio_len, phoneme_cache_path=config.phoneme_cache_path, + precompute_num_workers=config.precompute_num_workers, use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, @@ -357,8 +359,6 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
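
        Example:
            A direct-call sketch (the `Trainer` normally invokes this after an eval epoch):

                >>> figures, audios = model.test_run(assets={})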
""" - ap = assets["audio_processor"] - tokenizer = assets["tokenizer"] print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} @@ -370,18 +370,15 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: sen, self.config, "cuda" in str(next(self.parameters()).device), - ap, - tokenizer, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, ) test_audios["{}-audio".format(idx)] = outputs_dict["wav"] test_figures["{}-prediction".format(idx)] = plot_spectrogram( - outputs_dict["outputs"]["model_outputs"], ap, output_fig=False + outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) test_figures["{}-alignment".format(idx)] = plot_alignment( outputs_dict["outputs"]["alignments"], output_fig=False diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index b2c41df5e0..699f31426c 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -14,6 +14,7 @@ from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram @@ -170,11 +171,16 @@ class ForwardTTS(BaseTTS): """ # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: Coqpit, + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - super().__init__(config) + super().__init__(config, ap, tokenizer, speaker_manager) - self.speaker_manager = speaker_manager self.init_multispeaker(config) self.max_duration = self.args.max_duration @@ -692,19 +698,17 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -724,3 +728,19 @@ def on_train_step_start(self, trainer): """Enable binary alignment loss when needed""" if trainer.total_steps_done > self.config.binary_align_loss_start_step: self.use_binary_alignment_loss = True + + @staticmethod + def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + 
Args: + config (ForwardTTSConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return ForwardTTS(new_config, ap, tokenizer, speaker_manager) From f46ad54b89a5ca28a20fbddbe814a5df21b481c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:24 +0000 Subject: [PATCH 18/67] Update AlignTTS --- TTS/tts/models/align_tts.py | 43 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 2fc00b0b90..c1e2ffb34f 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -12,6 +13,7 @@ from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.io import load_fsspec @@ -100,11 +102,16 @@ class AlignTTS(BaseTTS): # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: "AlignTTSConfig", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): - super().__init__(config) + super().__init__(config, ap, tokenizer, speaker_manager) self.speaker_manager = speaker_manager - self.config = config self.phase = -1 self.length_scale = ( float(config.model_args.length_scale) @@ -112,10 +119,6 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): else config.model_args.length_scale ) - if not self.config.model_args.num_chars: - _, self.config, num_chars = self.get_characters(config) - self.config.model_args.num_chars = num_chars - self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) self.embedded_speaker_dim = 0 @@ -382,19 +385,17 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -430,3 +431,19 @@ def _set_phase(config, global_step): def 
on_epoch_start(self, trainer): """Set AlignTTS training phase on epoch start.""" self.phase = self._set_phase(trainer.config, trainer.total_steps_done) + + @staticmethod + def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (AlignTTSConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return AlignTTS(new_config, ap, tokenizer, speaker_manager) From a8a836578812390f1bb43cbaf39ece64744fce7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:31 +0000 Subject: [PATCH 19/67] Update GlowTTS --- TTS/tts/models/glow_tts.py | 48 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index af440072cd..3dd8d5c836 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,5 +1,5 @@ import math -from typing import Dict, Tuple, Union +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -50,8 +50,8 @@ class GlowTTS(BaseTTS): def __init__( self, config: GlowTTSConfig, - ap: "AudioProcessor", - tokenizer: "TTSTokenizer", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, speaker_manager: SpeakerManager = None, ): @@ -63,7 +63,6 @@ def __init__( for key in config: setattr(self, key, config[key]) - self.num_chars = self.tokenizer.characters.num_chars self.decoder_output_dim = config.out_channels # init multi-speaker layers if necessary @@ -427,20 +426,18 @@ def _create_logs(self, batch, outputs, ap): def train_log( self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int ) -> None: # pylint: disable=no-self-use - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) - logger.train_audios(steps, audios, ap.sample_rate) + logger.train_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - ap = assets["audio_processor"] - figures, audios = self._create_logs(batch, outputs, ap) + figures, audios = self._create_logs(batch, outputs, self.ap) logger.eval_figures(steps, figures) - logger.eval_audios(steps, audios, ap.sample_rate) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: @@ -465,19 +462,16 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: sen, self.config, "cuda" in str(next(self.parameters()).device), - self.ap, - self.tokenizer, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, ) test_audios["{}-audio".format(idx)] = outputs["wav"] test_figures["{}-prediction".format(idx)] = plot_spectrogram( - outputs["outputs"]["model_outputs"], ap, output_fig=False + outputs["outputs"]["model_outputs"], self.ap, 
output_fig=False ) test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios @@ -514,23 +508,17 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: Coqpit): - """Initialize model from config.""" - - # init characters - if config.use_phonemes: - from TTS.tts.utils.text.characters import IPAPhonemes - - characters = IPAPhonemes().init_from_config(config) - else: - from TTS.tts.utils.text.characters import Graphemes - - characters = Graphemes().init_from_config(config) - config.num_chars = characters.num_chars + def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + Args: + config (VitsConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + """ from TTS.utils.audio import AudioProcessor ap = AudioProcessor.init_from_config(config) - tokenizer = TTSTokenizer.init_from_config(config) - speaker_manager = SpeakerManager.init_from_config(config) - return GlowTTS(config, ap, tokenizer, speaker_manager) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return GlowTTS(new_config, ap, tokenizer, speaker_manager) From 4640d59f39644937d9f18cc83fa3514e0f4791ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:56:44 +0000 Subject: [PATCH 20/67] Update setup_model --- TTS/tts/models/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index cb1c2e2133..d76a3bebee 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,14 @@ +from typing import Dict, List, Union + from TTS.utils.generic_utils import find_module -def setup_model(config: "Coqpit") -> "BaseTTS": +def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": print(" > Using model: {}".format(config.model)) # fetch the right model implementation. if "base_model" in config and config["base_model"] is not None: MyModel = find_module("TTS.tts.models", config.base_model.lower()) else: MyModel = find_module("TTS.tts.models", config.model.lower()) - model = MyModel.init_from_config(config) + model = MyModel.init_from_config(config, samples) return model From f8fbbd409f68f0ad8e0875185e037c0e48fb0128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:57:51 +0000 Subject: [PATCH 21/67] Update BaseTTS config --- TTS/tts/configs/shared_configs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 10bee3e6c7..c7958fda00 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -78,7 +78,7 @@ class CharactersConfig(Coqpit): is_unique (bool): remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old - models trained with character lists with duplicates. + models trained with character lists with duplicates. Defaults to True. is_sorted (bool): Sort the characters in alphabetical order. Defaults to True. 
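As a usage sketch of the `setup_model` factory updated in PATCH 20/67 above, assuming a recipe-style `config` object and a `train_samples` list returned by `load_tts_samples`, the new `samples` argument simply flows through to the model's `init_from_config`:

    from TTS.tts.models import setup_model

    # dispatches to MyModel.init_from_config(config, samples)
    model = setup_model(config, samples=train_samples)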
@@ -166,6 +166,9 @@ class BaseTTSConfig(BaseTrainingConfig): compute_linear_spec (bool): If True data loader computes and returns linear spectrograms alongside the other data. + precompute_num_workers (int): + Number of workers to precompute features. Defaults to 0. + use_noise_augment (bool): Augment the input audio with random noise. @@ -207,6 +210,7 @@ class BaseTTSConfig(BaseTrainingConfig): phoneme_cache_path: str = None # vocabulary parameters characters: CharactersConfig = None + add_blank: bool = False # training params batch_group_size: int = 0 loss_masking: bool = None @@ -218,8 +222,8 @@ class BaseTTSConfig(BaseTrainingConfig): max_text_len: int = float("inf") compute_f0: bool = False compute_linear_spec: bool = False + precompute_num_workers: int = 0 use_noise_augment: bool = False - add_blank: bool = False # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer From ab413fda6aa2883a08625f0efe24a08f157b9d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:08 +0000 Subject: [PATCH 22/67] Update train_tts.py --- TTS/bin/train_tts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 3360a94051..f053e9d75c 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -81,7 +81,6 @@ def main(): model=model, train_samples=train_samples, eval_samples=eval_samples, - training_assets={"audio_processor": ap}, parse_command_line_args=False, ) trainer.fit() From cee01a66e1fd24f5b7b37e3ca51b24fdb0fd5e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:41 +0000 Subject: [PATCH 23/67] Update ljspeech recipes --- recipes/ljspeech/align_tts/train_aligntts.py | 37 ++++++++++------ .../ljspeech/fast_pitch/train_fast_pitch.py | 30 +++++++------ .../ljspeech/fast_speech/train_fast_speech.py | 30 +++++++------ recipes/ljspeech/glow_tts/train_glowtts.py | 3 +- .../speedy_speech/train_speedy_speech.py | 43 +++++++++++-------- .../tacotron2-DCA/train_tacotron_dca.py | 39 +++++++++++------ .../tacotron2-DDC/train_tacotron_ddc.py | 25 +++++++++-- recipes/ljspeech/vits_tts/train_vits.py | 7 +-- recipes/vctk/vits/train_vits.py | 21 ++++++--- 9 files changed, 155 insertions(+), 80 deletions(-) diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index 68b67d66e7..d0187aa816 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -1,9 +1,11 @@ import os from TTS.trainer import Trainer, TrainingArgs -from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig +from TTS.tts.configs.align_tts_config import AlignTTSConfig +from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.align_tts import AlignTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -31,23 +33,32 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. 
+# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init model -model = AlignTTS(config) +model = AlignTTS(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 🚀 trainer.fit() diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 0a4a965b63..3a772251c8 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -5,6 +5,7 @@ from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.manage import ModelManager @@ -46,9 +47,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=4, print_step=50, print_eval=False, mixed_precision=False, @@ -67,23 +68,28 @@ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. 
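+# A single loaded sample looks like ["Some text.", "/data/LJSpeech-1.1/wavs/LJ001-0001.wav", "ljspeech"] (illustrative values).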
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init the model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index a71da94bae..f9f1bc0605 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -5,6 +5,7 @@ from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.manage import ModelManager @@ -45,9 +46,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=8, print_step=50, print_eval=False, mixed_precision=False, @@ -66,23 +67,28 @@ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init the model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer) # init the trainer and 🚀 trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index 4762a77aa1..dd450a572f 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -52,7 +52,8 @@ # INITIALIZE THE TOKENIZER # Tokenizer is used to convert text to sequences of token IDs. 
-tokenizer = TTSTokenizer.init_from_config(config) +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) # LOAD DATA SAMPLES # Each sample is a list of ```[text, audio_file_path, speaker_name]``` diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 6b9683afb6..468e8a5f12 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -5,6 +5,7 @@ from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -38,9 +39,9 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=4, print_step=50, print_eval=False, mixed_precision=False, @@ -50,14 +51,22 @@ datasets=[dataset_config], ) -# # compute alignments -# if not config.model_args.use_aligner: -# manager = ModelManager() -# model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") -# # TODO: make compute_attention python callable -# os.system( -# f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" -# ) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) + +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init audio processor ap = AudioProcessor(**config.audio.to_dict()) @@ -66,16 +75,14 @@ train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init model -model = ForwardTTS(config) +model = ForwardTTS(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index cf00ccc2b4..97a16ab7fb 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -6,6 +6,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer @@ -54,23 +55,35 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init model -model = Tacotron2(config) +# INITIALIZE THE MODEL +# Models take a config object and a speaker manager as input +# Config defines the details of the model like the number of layers, the size of the embedding, etc. +# Speaker manager is used by multi-speaker models. +model = Tacotron2(config, ap, tokenizer) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 🚀 trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index b452094af2..285c416c58 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -6,6 +6,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer @@ -46,6 +47,7 @@ use_phonemes=True, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=8, print_step=25, print_eval=True, mixed_precision=False, @@ -56,11 +58,28 @@ # init audio processor ap = AudioProcessor(**config.audio.to_dict()) -# load training samples +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. 
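+# It is built from `config.audio`, replacing the manual `AudioProcessor(**config.audio.to_dict())` call above.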
+ap = AudioProcessor.init_from_config(config) + +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init model -model = Tacotron2(config) +# INITIALIZE THE MODEL +# Models take a config object and a speaker manager as input +# Config defines the details of the model like the number of layers, the size of the embedding, etc. +# Speaker manager is used by multi-speaker models. +model = Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index 0588e9d9cf..79c0db2e9f 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -33,7 +33,7 @@ config = VitsConfig( audio=audio_config, run_name="vits_ljspeech", - batch_size=48, + batch_size=16, eval_batch_size=16, batch_group_size=5, num_loader_workers=0, @@ -48,7 +48,7 @@ compute_input_seq_cache=True, print_step=25, print_eval=True, - mixed_precision=True, + mixed_precision=False, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], @@ -61,7 +61,8 @@ # INITIALIZE THE TOKENIZER # Tokenizer is used to convert text to sequences of token IDs. -tokenizer = TTSTokenizer.init_from_config(config) +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) # LOAD DATA SAMPLES # Each sample is a list of ```[text, audio_file_path, speaker_name]``` diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 7eb741c4d9..2906557dde 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -63,10 +64,21 @@ datasets=[dataset_config], ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. 
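+# In VCTK each sample carries its speaker name, which the speaker manager below parses into speaker IDs.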
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -76,7 +88,7 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = Vits(config, speaker_manager) +model = Vits(config, ap, tokenizer, speaker_manager) # init the trainer and 🚀 trainer = Trainer( @@ -86,6 +98,5 @@ model=model, train_samples=train_samples, eval_samples=eval_samples, - training_assets={"audio_processor": ap}, ) trainer.fit() From e9448ca33901c176f5e765e4f97065958bed8136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:58:55 +0000 Subject: [PATCH 24/67] Update loader tests --- tests/data_tests/test_loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index ac850a1440..f2f2a8d238 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - tokenizer = TTSTokenizer.init_from_config(c) + tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=True, @@ -101,8 +101,8 @@ def test_loader(self): if self.ap.symmetric_norm: self.assertLessEqual(mel_input.max(), self.ap.max_norm) self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm - ) # pylint: disable=invalid-unary-operand-type + mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type + ) self.assertLess(mel_input.min(), 0) else: self.assertLessEqual(mel_input.max(), self.ap.max_norm) From c974633aa9d5254714ddf032aa1b441d31177745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:59:11 +0000 Subject: [PATCH 25/67] Update tests --- tests/inference_tests/test_synthesize.py | 12 ++-- tests/text_tests/test_characters.py | 4 +- tests/text_tests/test_phonemizer.py | 85 +++++++++++++++++++++--- tests/text_tests/test_tokenizer.py | 14 ++-- tests/tts_tests/test_glow_tts_train.py | 1 - tests/tts_tests/test_vits_train.py | 1 - 6 files changed, 90 insertions(+), 27 deletions(-) diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 635506ab21..42b7717281 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -19,9 +19,9 @@ def test_synthesize(): f'--text "This is an example." --out_path "{output_path}"' ) - # multi-speaker model - run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - run_cli( - f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - f'--text "This is an example." --out_path "{output_path}"' - ) + # multi-speaker SC-Glow model + # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") + # run_cli( + # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' + # f'--text "This is an example." 
--out_path "{output_path}"' + # ) diff --git a/tests/text_tests/test_characters.py b/tests/text_tests/test_characters.py index ed84b5b40b..3f4086d59e 100644 --- a/tests/text_tests/test_characters.py +++ b/tests/text_tests/test_characters.py @@ -2,6 +2,8 @@ from TTS.tts.utils.text.characters import BaseCharacters, Graphemes, IPAPhonemes, create_graphemes, create_phonemes +# pylint: disable=protected-access + def test_make_symbols(): _ = create_phonemes() @@ -12,7 +14,7 @@ class BaseCharacterTest(unittest.TestCase): def setUp(self): self.characters_empty = BaseCharacters("", "", pad="", eos="", bos="", blank="", is_unique=True, is_sorted=True) - def test_default_character_sets(self): + def test_default_character_sets(self): # pylint: disable=no-self-use """Test initiation of default character sets""" _ = IPAPhonemes() _ = Graphemes() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index aa7a54991a..512cc195f3 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -1,20 +1,38 @@ import unittest -from TTS.tts.utils.text.characters import BaseCharacters, Graphemes, IPAPhonemes, create_graphemes, create_phonemes from TTS.tts.utils.text.phonemizers import ESpeak, Gruut, JA_JP_Phonemizer, ZH_CN_Phonemizer -from TTS.tts.utils.text.tokenizer import TTSTokenizer -EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" +EXAMPLE_TEXTs = [ + "Recent research at Harvard has shown meditating", + "for as little as 8 weeks can actually increase, the grey matter", + "in the parts of the brain responsible", + "for emotional regulation and learning!", +] + + +EXPECTED_ESPEAK_PHONEMES = [ + "ɹ|ˈiː|s|ə|n|t ɹ|ɪ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ɪ|t|ˌeɪ|ɾ|ɪ|ŋ", + "f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|n|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ", + "ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ɪ|s|p|ˈɑː|n|s|ə|b|əl", + "f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|uː|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!", +] + + +EXPECTED_ESPEAKNG_PHONEMES = [ + "ɹ|ˈiː|s|ə|n|t ɹ|ᵻ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ᵻ|t|ˌeɪ|ɾ|ɪ|ŋ", + "f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|ŋ|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ", + "ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ᵻ|s|p|ˈɑː|n|s|ᵻ|b|əl", + "f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|ʊ|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!", +] class TestEspeakPhonemizer(unittest.TestCase): def setUp(self): - self.phonemizer = ESpeak(language="en-us") - self.EXPECTED_PHONEMES = "ɹ|ˈiː|s|ə|n|t ɹ|ɪ|s|ˈɜː|tʃ æ|t h|ˈɑːɹ|v|ɚ|d h|ɐ|z ʃ|ˈoʊ|n m|ˈɛ|d|ᵻ|t|ˌeɪ|ɾ|ɪ|ŋ f|ɔː|ɹ æ|z l|ˈɪ|ɾ|əl æ|z ˈeɪ|t w|ˈiː|k|s k|æ|n ˈæ|k|tʃ|uː|əl|i| ˈɪ|n|k|ɹ|iː|s, ð|ə ɡ|ɹ|ˈeɪ m|ˈæ|ɾ|ɚ|ɹ ɪ|n|ð|ə p|ˈɑːɹ|t|s ʌ|v|ð|ə b|ɹ|ˈeɪ|n ɹ|ɪ|s|p|ˈɑː|n|s|ə|b|əl f|ɔː|ɹ ɪ|m|ˈoʊ|ʃ|ə|n|əl ɹ|ˌɛ|ɡ|j|uː|l|ˈeɪ|ʃ|ə|n|| æ|n|d l|ˈɜː|n|ɪ|ŋ!" + self.phonemizer = ESpeak(language="en-us", backend="espeak") - def test_phonemize(self): - output = self.phonemizer.phonemize(EXAMPLE_TEXT, separator="|") - self.assertEqual(output, self.EXPECTED_PHONEMES) + for text, ph in zip(EXAMPLE_TEXTs, EXPECTED_ESPEAK_PHONEMES): + phonemes = self.phonemizer.phonemize(text) + self.assertEqual(phonemes, ph) # multiple punctuations text = "Be a voice, not an! echo?" 
@@ -48,14 +66,59 @@ def test_is_available(self): self.assertTrue(self.phonemizer.is_available()) +class TestEspeakNgPhonemizer(unittest.TestCase): + def setUp(self): + self.phonemizer = ESpeak(language="en-us", backend="espeak-ng") + + for text, ph in zip(EXAMPLE_TEXTs, EXPECTED_ESPEAKNG_PHONEMES): + phonemes = self.phonemizer.phonemize(text) + self.assertEqual(phonemes, ph) + + # multiple punctuations + text = "Be a voice, not an! echo?" + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?" + output = self.phonemizer.phonemize(text, separator="|") + output = output.replace("|", "") + self.assertEqual(output, gt) + + # not ending with punctuation + text = "Be a voice, not an! echo" + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ" + output = self.phonemizer.phonemize(text, separator="") + self.assertEqual(output, gt) + + # extra space after the sentence + text = "Be a voice, not an! echo. " + gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ." + output = self.phonemizer.phonemize(text, separator="") + self.assertEqual(output, gt) + + def test_name(self): + self.assertEqual(self.phonemizer.name(), "espeak") + + def test_get_supported_languages(self): + self.assertIsInstance(self.phonemizer.supported_languages(), dict) + + def test_get_version(self): + self.assertIsInstance(self.phonemizer.version(), str) + + def test_is_available(self): + self.assertTrue(self.phonemizer.is_available()) + + class TestGruutPhonemizer(unittest.TestCase): def setUp(self): self.phonemizer = Gruut(language="en-us", use_espeak_phonemes=True, keep_stress=False) - self.EXPECTED_PHONEMES = "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ| f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l| f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" + self.EXPECTED_PHONEMES = ["ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", + "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", + "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", + "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" + ] def test_phonemize(self): - output = self.phonemizer.phonemize(EXAMPLE_TEXT, separator="|") - self.assertEqual(output, self.EXPECTED_PHONEMES) + for text, ph in zip(EXAMPLE_TEXTs, self.EXPECTED_PHONEMES): + phonemes = self.phonemizer.phonemize(text, separator="|") + self.assertEqual(phonemes, ph) # multiple punctuations text = "Be a voice, not an! echo?" 
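The split into `EXPECTED_ESPEAK_PHONEMES` and `EXPECTED_ESPEAKNG_PHONEMES` above reflects that the two backends emit slightly different IPA for the same input. A minimal sketch of the API these tests exercise, assuming the `espeak` and `espeak-ng` binaries are installed:

    from TTS.tts.utils.text.phonemizers import ESpeak

    ph = ESpeak(language="en-us", backend="espeak")
    print(ph.phonemize("Be a voice, not an! echo?", separator="|"))

    # the backend can also be swapped after construction, as the tokenizer test does
    ph.backend = "espeak-ng"
    print(ph.phonemize("Be a voice, not an! echo?", separator="|"))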
diff --git a/tests/text_tests/test_tokenizer.py b/tests/text_tests/test_tokenizer.py index 4d3fb0ce70..471745185f 100644 --- a/tests/text_tests/test_tokenizer.py +++ b/tests/text_tests/test_tokenizer.py @@ -1,6 +1,5 @@ import unittest from dataclasses import dataclass -from os import sep from coqpit import Coqpit @@ -13,7 +12,7 @@ class TestTTSTokenizer(unittest.TestCase): def setUp(self): self.tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes()) - self.ph = ESpeak("tr") + self.ph = ESpeak("tr", backend="espeak") self.tokenizer_ph = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) def test_encode_decode_graphemes(self): @@ -54,12 +53,12 @@ def test_print_logs(self): def test_not_found_characters(self): self.ph = ESpeak("en-us") - self.tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) + tokenizer_local = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=self.ph) self.assertEqual(len(self.tokenizer.not_found_characters), 0) text = "Yolk of one egg beaten light" - ids = self.tokenizer_local.text_to_ids(text) - text_hat = self.tokenizer_local.ids_to_text(ids) - self.assertEqual(self.tokenizer_local.not_found_characters, ["̩"]) + ids = tokenizer_local.text_to_ids(text) + text_hat = tokenizer_local.ids_to_text(ids) + self.assertEqual(tokenizer_local.not_found_characters, ["̩"]) self.assertEqual(text_hat, "jˈoʊk ʌv wˈʌn ˈɛɡ bˈiːʔn lˈaɪt") def test_init_from_config(self): @@ -85,7 +84,8 @@ class TokenizerConfig(Coqpit): text_cleaner: str = "phoneme_cleaners" characters = Characters() - tokenizer_ph = TTSTokenizer.init_from_config(TokenizerConfig()) + tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig()) + tokenizer_ph.phonemizer.backend = "espeak" text = "Bu bir Örnek." 
text_ph = "" + self.ph.phonemize(text, separator="") + "" ids = tokenizer_ph.text_to_ids(text) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index e590107658..7796b76051 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 607f7b29de..25793c0603 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 848fd73acaab80ed9372c2c466de37af8b2da349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 12:59:28 +0000 Subject: [PATCH 26/67] Update spec extractor --- TTS/bin/extract_tts_spectrograms.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 7b489fd653..40079f1b46 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -13,6 +13,7 @@ from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -20,21 +21,20 @@ def setup_loader(ap, r, verbose=False): + tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( - r, - c.text_cleaner, + outputs_per_step=r, compute_linear_spec=False, - meta_data=meta_data, + samples=meta_data, + tokenizer=tokenizer, ap=ap, - characters=c.characters if "characters" in c.keys() else None, - add_blank=c["add_blank"] if "add_blank" in c.keys() else False, batch_group_size=0, - min_seq_len=c.min_seq_len, - max_seq_len=c.max_seq_len, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, phoneme_cache_path=c.phoneme_cache_path, - use_phonemes=c.use_phonemes, - phoneme_language=c.phoneme_language, - enable_eos_bos=c.enable_eos_bos_chars, + precompute_num_workers=0, use_noise_augment=False, verbose=verbose, speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, @@ -44,7 +44,7 @@ def setup_loader(ap, r, verbose=False): if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
dataset.compute_input_seq(c.num_loader_workers) - dataset.sort_and_filter_items(c.get("sort_by_audio_len", default=False)) + dataset.preprocess_samples() loader = DataLoader( dataset, @@ -75,8 +75,8 @@ def set_filename(wav_path, out_path): def format_data(data): # setup input data - text_input = data["text"] - text_lengths = data["text_lengths"] + text_input = data["token_id"] + text_lengths = data["token_id_lengths"] mel_input = data["mel"] mel_lengths = data["mel_lengths"] item_idx = data["item_idxs"] From 3a15e2f88727cfbdcd6d50173ee0b2f925d98b04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 13:01:53 +0000 Subject: [PATCH 27/67] Update ljspeech download --- recipes/ljspeech/download_ljspeech.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh index 14ef058da6..9468988a99 100644 --- a/recipes/ljspeech/download_ljspeech.sh +++ b/recipes/ljspeech/download_ljspeech.sh @@ -10,5 +10,5 @@ tar -xjf LJSpeech-1.1.tar.bz2 shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ +mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ rm LJSpeech-1.1.tar.bz2 \ No newline at end of file From 9338c7b6c4d5c75107e0b370603d1cb65111ce71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 7 Dec 2021 13:02:02 +0000 Subject: [PATCH 28/67] Update pylintrc --- .pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 6e9f953edd..d5f9c4909c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -168,7 +168,8 @@ disable=missing-docstring, exception-escape, comprehension-escape, duplicate-code, - not-callable + not-callable, + import-outside-toplevel # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option From 672d766906cfe47be801d934c6fcd8dc8d314873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 14:45:32 +0000 Subject: [PATCH 29/67] Update VCTK formatter --- TTS/tts/datasets/formatters.py | 37 +++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 1f23f85e6f..6dd91bc8c2 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,8 +285,10 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): - """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" +def vctk(root_path, meta_files=None, wavs_path="wav22", mic="mic2"): + """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" + file_ext = 'flac' + test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: @@ -298,26 +300,33 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_" + speaker_id]) - + # p280 has no mic2 recordings + if speaker_id == "p280": + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_mic1.{file_ext}") + else: + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") + if os.path.exists(wav_file): + items.append([text, wav_file, "VCTK_" + speaker_id]) + else: + print(f" [!] 
wav files don't exist - {wav_file}") return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): # pylint: disable=unused-argument +def vctk_old(root_path, meta_files=None, wavs_path="wav48"): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" + test_speakers = meta_files items = [] - txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) - for text_file in txt_files: - _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - # ignore speakers - if isinstance(ignored_speakers, list): - if speaker_id in ignored_speakers: + if isinstance(test_speakers, list): # if is list ignore this speakers ids + if speaker_id in test_speakers: continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([None, wav_file, "VCTK_" + speaker_id]) - + items.append([text, wav_file, "VCTK_old_" + speaker_id]) return items From cecce069a8c32fa42c8f638a9bb693e731acb93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 14:45:57 +0000 Subject: [PATCH 30/67] Add file_ext args to resample.py --- TTS/bin/resample.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py index 3c5ef29c21..c9f1166a64 100644 --- a/TTS/bin/resample.py +++ b/TTS/bin/resample.py @@ -26,6 +26,7 @@ def resample_file(func_args): --input_dir /root/LJSpeech-1.1/ --output_sr 22050 --output_dir /root/resampled_LJSpeech-1.1/ + --file_ext wav --n_jobs 24 """, formatter_class=RawTextHelpFormatter, @@ -55,6 +56,14 @@ def resample_file(func_args): help="Path of the destination folder. 
If not defined, the operation is done in place", ) + parser.add_argument( + "--file_ext", + type=str, + default="wav", + required=False, + help="Extension of the audio files to resample", + ) + parser.add_argument( "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores" ) @@ -67,7 +76,7 @@ def resample_file(func_args): args.input_dir = args.output_dir print("Resampling the audio files...") - audio_files = glob.glob(os.path.join(args.input_dir, "**/*.wav"), recursive=True) + audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True) print(f"Found {len(audio_files)} files...") audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr])) with Pool(processes=args.n_jobs) as p: From 95df38c66007a95849600259a80f6c0f006f8f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Dec 2021 15:15:56 +0000 Subject: [PATCH 31/67] Update VCTK recipes --- TTS/tts/datasets/formatters.py | 2 +- .../speedy_speech/train_speedy_speech.py | 6 --- recipes/vctk/fast_pitch/train_fast_pitch.py | 43 +++++++++++------ recipes/vctk/fast_speech/train_fast_speech.py | 48 +++++++++++-------- recipes/vctk/glow_tts/train_glow_tts.py | 41 +++++++++++----- .../vctk/speedy_speech/train_speedy_speech.py | 44 ++++++++++------- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 42 ++++++++++------ .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 41 ++++++++++------ recipes/vctk/tacotron2/train_tacotron2.py | 41 ++++++++++------ 9 files changed, 192 insertions(+), 116 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 6dd91bc8c2..7e47c44d98 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,7 +285,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav22", mic="mic2"): +def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" file_ext = 'flac' test_speakers = meta_files diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 468e8a5f12..2f8896c577 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -68,12 +68,6 @@ # Check `TTS.tts.datasets.load_tts_samples` for more details. 
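 # For reference, each loaded sample is a plain list in the documented order
 # [text, audio_file_path, speaker_name]; a hypothetical LJSpeech item:
 #   ["in the only sense with which we are at present concerned", "/data/LJSpeech-1.1/wavs/LJ001-0002.wav", "ljspeech"]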
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) - -# load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) - # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index f40587e091..f7a2ef068a 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, + precompute_num_workers=4, compute_f0=True, f0_cache_path=os.path.join(output_path, "f0_cache"), run_eval=True, @@ -39,23 +41,35 @@ epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,15 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() + diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index b29888099a..853bbb545f 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -25,37 +26,48 @@ ) config = FastSpeechConfig( - run_name="fast_pitch_ljspeech", + run_name="fast_speech_vctk", audio=audio_config, batch_size=32, eval_batch_size=16, num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 8c9f538865..30050ef535 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # set experiment paths @@ -32,6 +33,7 @@ eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, @@ -45,12 +47,27 @@ output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -60,16 +77,14 @@ config.num_speakers = speaker_manager.num_speakers # init model -model = GlowTTS(config, speaker_manager) +model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() + +# AND... 3,2,1... 
🚀 +trainer.fit() \ No newline at end of file diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 81f78d265b..85e347fc07 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -6,6 +6,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,30 +33,41 @@ num_loader_workers=8, num_eval_loader_workers=4, compute_input_seq_cache=True, - compute_f0=True, - f0_cache_path=os.path.join(output_path, "f0_cache"), + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, - max_seq_len=500000, + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=500000, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, ) -# init audio processor -ap = AudioProcessor(**config.audio) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +77,14 @@ config.model_args.num_speakers = speaker_manager.num_speakers # init model -model = ForwardTTS(config, speaker_manager) +model = ForwardTTS(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index b0030f1749..7960b34bc4 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron import Tacotron from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -32,6 +33,7 @@ eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, + precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, r=6, @@ -45,18 +47,30 @@ print_step=25, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=0, - max_seq_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -65,16 +79,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron(config, speaker_manager) +model = Tacotron(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index 63efb78470..bc7951b572 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit() diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 346d650b8f..82dedade77 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -7,6 +7,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -44,9 +45,10 @@ print_step=150, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=14800, - max_seq_len=22050 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio + min_text_len=0, + max_text_len=500, + min_audio_len=0, + max_audio_len=44000 * 10, output_path=output_path, datasets=[dataset_config], use_speaker_embedding=True, # set this to enable multi-sepeaker training @@ -60,10 +62,21 @@ lr=3e-5, ) -# init audio processor -ap = AudioProcessor(**config.audio.to_dict()) +## INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) -# load training samples +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) # init speaker manager for multi-speaker training @@ -72,16 +85,14 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) # init model -model = Tacotron2(config, speaker_manager) +model = Tacotron2(config, ap, tokenizer, speaker_manager) -# init the trainer and 🚀 +# INITIALIZE THE TRAINER +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training, etc. trainer = Trainer( - TrainingArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) + +# AND... 3,2,1... 
🚀 trainer.fit()

From b4cbf2e62fb4b3d5e6fd00a7f83a08a41cdc957a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Dec 2021 15:16:16 +0000
Subject: [PATCH 32/67] Fix `too many open files`

---
 TTS/tts/datasets/dataset.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 229f59c7a0..50fd97d997 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -11,6 +11,10 @@
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor

+# to prevent too many open files error as suggested here
+# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
+torch.multiprocessing.set_sharing_strategy('file_system')
+

 def _parse_sample(item):
     language_name = None

From bbad03ed843c705bb8848e630303e1bac02db55f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Dec 2021 15:18:14 +0000
Subject: [PATCH 33/67] Update recipes README.md

---
 recipes/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/recipes/README.md b/recipes/README.md
index cf3f3de94d..21a6727d8b 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -11,6 +11,12 @@
 $ sh ./recipes/<dataset_name>/download_<dataset_name>.sh
 $ python recipes/<dataset_name>/<model_name>/train.py
 ```

+For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050 Hz as follows.
+
+```console
+python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac
+```
+
 If you train a new model using TTS, feel free to share your training to expand the list of recipes.

 You can also open a new discussion and share your progress with the 🐸 community.
\ No newline at end of file From 13a8f7151e2d2501d2a1ddee8a27482a23cd1c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:32:31 +0000 Subject: [PATCH 34/67] Delete `use_espeak_phonemes` from tests --- tests/tts_tests/test_vits_d-vectors_train.py | 1 - tests/tts_tests/test_vits_multilingual_train.py | 1 - tests/tts_tests/test_vits_speaker_emb_train.py | 1 - 3 files changed, 3 deletions(-) diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 213669f50b..5fd9cbc1bd 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 50cccca500..577db8a081 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -33,7 +33,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 6cc1dabd68..b9a1102e49 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -16,7 +16,6 @@ num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 90fe858429fc36ad5c8bbd30c2aaee93710fa696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:33:24 +0000 Subject: [PATCH 35/67] =?UTF-8?q?Fix=20synthesis.py=20=F0=9F=94=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/tts/utils/synthesis.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 47ea0e934c..72cd8403a8 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -175,8 +175,6 @@ def synthesis( text, CONFIG, use_cuda, - ap, - tokenizer, speaker_id=None, style_wav=None, use_griffin_lim=False, @@ -232,10 +230,10 @@ def synthesis( if isinstance(style_wav, dict): style_mel = style_wav else: - style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) + style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) # convert text to sequence of token IDs text_inputs = np.asarray( - tokenizer.text_to_ids(text), + model.tokenizer.text_to_ids(text), dtype=np.int32, ) # pass tensors to backend From bddcc9d7cc9f26d3073d18a517e71292739ae403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:08 +0000 Subject: [PATCH 36/67] Fixes small compat. 
issues --- TTS/tts/datasets/__init__.py | 4 +-- TTS/tts/datasets/dataset.py | 4 +-- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/models/base_tts.py | 8 +++-- TTS/tts/utils/languages.py | 9 ++++++ TTS/tts/utils/speakers.py | 32 ++++++++++++------- recipes/vctk/fast_pitch/train_fast_pitch.py | 1 - recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- tests/text_tests/test_phonemizer.py | 9 +++--- 10 files changed, 46 insertions(+), 27 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 4e8a2485db..40eed7e365 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -88,8 +88,8 @@ def load_tts_samples( meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for the duration predictor training - if d.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"])) + if dataset.meta_file_attn_mask: + meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins[1]].strip() meta_data_train_all[idx].append(attn_file) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 50fd97d997..5fab71088d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -13,7 +13,7 @@ # to prevent too many open files error as suggested here # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936 -torch.multiprocessing.set_sharing_strategy('file_system') +torch.multiprocessing.set_sharing_strategy("file_system") def _parse_sample(item): @@ -208,7 +208,7 @@ def get_token_ids(self, idx, text): def load_data(self, idx): item = self.samples[idx] - text, wav_file, speaker_name, _, attn_file = _parse_sample(item) + text, wav_file, speaker_name, language_name, attn_file = _parse_sample(item) raw_text = text wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 7e47c44d98..68c07eaa11 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -287,7 +287,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" - file_ext = 'flac' + file_ext = "flac" test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 5986232231..9a6a56df76 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -261,7 +261,7 @@ def get_data_loader( speaker_id_mapping = None d_vector_mapping = None - # setup custom symbols if needed + # setup multi-lingual attributes if hasattr(self, "language_manager"): language_id_mapping = ( self.language_manager.language_id_mapping if self.args.use_language_embedding else None @@ -290,6 +290,7 @@ def get_data_loader( speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, tokenizer=self.tokenizer, + language_id_mapping=language_id_mapping, ) # wait all the DDP process to be ready @@ -303,6 +304,7 @@ def get_data_loader( sampler = DistributedSampler(dataset) if num_gpus > 1 else None # Weighted samplers + # TODO: make this DDP amenable assert not ( num_gpus > 1 and getattr(config, 
"use_language_weighted_sampler", False) ), "language_weighted_sampler is not supported with DistributedSampler" @@ -313,10 +315,10 @@ def get_data_loader( if sampler is None: if getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.items) + sampler = get_language_weighted_sampler(dataset.samples) elif getattr(config, "use_speaker_weighted_sampler", False): print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.items) + sampler = get_speaker_weighted_sampler(dataset.samples) loader = DataLoader( dataset, diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index fc7eec575e..5cecbe6908 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -98,6 +98,15 @@ def save_language_ids_to_file(self, file_path: str) -> None: """ self._save_json(file_path, self.language_id_mapping) + @staticmethod + def init_from_config(config: Coqpit) -> "LanguageManager": + """Initialize the language manager from a Coqpit config. + + Args: + config (Coqpit): Coqpit config. + """ + return LanguageManager(config=config) + def _set_file_path(path): """Find the language_ids.json under the given path or the above it. diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 9d2e6fe30c..7572e888d9 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -9,7 +9,7 @@ from coqpit import Coqpit from torch.utils.data.sampler import WeightedRandomSampler -from TTS.config import load_config +from TTS.config import get_from_config_or_model_args_with_default, load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -331,19 +331,27 @@ def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = SpeakerEncoder: Speaker encoder object. 
""" speaker_manager = None - if hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding: + if get_from_config_or_model_args_with_default(config, "use_speaker_embedding", False): if samples: speaker_manager = SpeakerManager(data_items=samples) - if config.get("speaker_file", None): - speaker_manager = SpeakerManager(speaker_id_file_path=config.speaker_file) - if config.get("speakers_file", None): - speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - - if hasattr(config, "use_d_vector_file") and config.use_d_vector_file: - if config.get("speakers_file", None): - speaker_manager = SpeakerManager(d_vectors_file_path=config.speaker_file) - if config.get("d_vector_file", None): - speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + if get_from_config_or_model_args_with_default(config, "speaker_file", None): + speaker_manager = SpeakerManager( + speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) + ) + if get_from_config_or_model_args_with_default(config, "speakers_file", None): + speaker_manager = SpeakerManager( + speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speakers_file", None) + ) + + if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False): + if get_from_config_or_model_args_with_default(config, "speakers_file", None): + speaker_manager = SpeakerManager( + d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) + ) + if get_from_config_or_model_args_with_default(config, "d_vector_file", None): + speaker_manager = SpeakerManager( + d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None) + ) return speaker_manager diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index f7a2ef068a..4d9cc10d1f 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -90,4 +90,3 @@ # AND... 3,2,1... 🚀 trainer.fit() - diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index 853bbb545f..1dcab98285 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -87,4 +87,4 @@ ) # AND... 3,2,1... 🚀 -trainer.fit() \ No newline at end of file +trainer.fit() diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 30050ef535..e35e552db7 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -87,4 +87,4 @@ ) # AND... 3,2,1... 🚀 -trainer.fit() \ No newline at end of file +trainer.fit() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index 512cc195f3..9b619f6ea7 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -109,10 +109,11 @@ def test_is_available(self): class TestGruutPhonemizer(unittest.TestCase): def setUp(self): self.phonemizer = Gruut(language="en-us", use_espeak_phonemes=True, keep_stress=False) - self.EXPECTED_PHONEMES = ["ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", - "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", - "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", - "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!" 
+ self.EXPECTED_PHONEMES = [ + "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ", + "f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s, ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ", + "ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l", + "f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ!", ] def test_phonemize(self): From c35b0c9014db302a0cd53740bf0bc095a299ee13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:29 +0000 Subject: [PATCH 37/67] Update Vits for the new model API --- TTS/tts/models/vits.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 957994f989..83d2f9f92d 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -275,7 +275,12 @@ class Vits(BaseTTS): # pylint: disable=dangerous-default-value def __init__( - self, config: Coqpit, ap: "AudioProcessor", tokenizer: "TTSTokenizer", speaker_manager: SpeakerManager = None, language_manager: LanguageManager = None + self, + config: Coqpit, + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + language_manager: LanguageManager = None, ): super().__init__(config, ap, tokenizer, speaker_manager) @@ -284,8 +289,6 @@ def __init__( self.speaker_manager = speaker_manager self.language_manager = language_manager - self.args = args - self.init_multispeaker(config) self.init_multilingual(config) @@ -306,6 +309,7 @@ def __init__( self.args.num_layers_text_encoder, self.args.kernel_size_text_encoder, self.args.dropout_p_text_encoder, + language_emb_dim=self.embedded_language_dim, ) self.posterior_encoder = PosteriorEncoder( @@ -344,6 +348,7 @@ def __init__( 3, self.args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) self.waveform_decoder = HifiganGenerator( @@ -878,7 +883,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s return self._log(self.ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self) -> Tuple[Dict, Dict]: + def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
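# (Under the refactor, `test_run` now receives the Trainer's `assets` dict in
#  place of the module-level audio processor that recipes used to pass via
#  `training_assets={"audio_processor": ap}`; the next hunk shows the method
#  reading `self.ap` instead.)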
@@ -898,7 +903,7 @@ def test_run(self) -> Tuple[Dict, Dict]: aux_inputs["text"], self.config, "cuda" in str(next(self.parameters()).device), - ap, + self.ap, speaker_id=aux_inputs["speaker_id"], d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], @@ -1001,7 +1006,8 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) - return Vits(new_config, ap, tokenizer, speaker_manager) + language_manager = LanguageManager.init_from_config(config) + return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) class VitsCharacters(BaseCharacters): From b2e1420e6b4d787eeb56e2958b756211c7c04b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 7 Jan 2022 15:38:57 +0000 Subject: [PATCH 38/67] Update train_tts for the new API --- TTS/bin/train_tts.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index f053e9d75c..9a4a430a20 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -42,36 +42,8 @@ def main(): # load training samples train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True) - # setup audio processor - ap = AudioProcessor(**config.audio) - - # init speaker manager - if check_config_and_model_args(config, "use_speaker_embedding", True): - speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) - if hasattr(config, "model_args"): - config.model_args.num_speakers = speaker_manager.num_speakers - else: - config.num_speakers = speaker_manager.num_speakers - elif check_config_and_model_args(config, "use_d_vector_file", True): - speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) - if hasattr(config, "model_args"): - config.model_args.num_speakers = speaker_manager.num_speakers - else: - config.num_speakers = speaker_manager.num_speakers - else: - speaker_manager = None - - if hasattr(config, "use_language_embedding") and config.use_language_embedding: - language_manager = LanguageManager(config=config) - if hasattr(config, "model_args"): - config.model_args.num_languages = language_manager.num_languages - else: - config.num_languages = language_manager.num_languages - else: - language_manager = None - # init the model from config - model = setup_model(config, speaker_manager, language_manager) + model = setup_model(config, train_samples + eval_samples) # init the trainer and 🚀 trainer = Trainer( From 83b6cf5876ca13205de72cfdb9c867f453aab704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:35:52 +0000 Subject: [PATCH 39/67] Extend glow_tts model tests --- TTS/tts/models/glow_tts.py | 61 +++++-- tests/tts_tests/test_glow_tts.py | 291 +++++++++++++++++++++++++++---- 2 files changed, 298 insertions(+), 54 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 3dd8d5c836..190c699e01 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -40,11 +40,20 @@ class GlowTTS(BaseTTS): Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments. Examples: + Init only model layers. 
+ >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig >>> from TTS.tts.models.glow_tts import GlowTTS - >>> config = GlowTTSConfig() + >>> config = GlowTTSConfig(num_chars=2) >>> model = GlowTTS(config) + Fully init a model ready for action. All the class attributes and class members + (e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values. + + >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig + >>> from TTS.tts.models.glow_tts import GlowTTS + >>> config = GlowTTSConfig() + >>> model = GlowTTS.init_from_config(config, verbose=False) """ def __init__( @@ -98,25 +107,23 @@ def __init__( def init_multispeaker(self, config: Coqpit): """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding - vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension. + vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets + speaker embedding vector dimension to the d-vector dimension from the config. Args: config (Coqpit): Model configuration. """ self.embedded_speaker_dim = 0 - # init speaker manager - if self.speaker_manager is None and (self.use_speaker_embedding or self.use_d_vector_file): - raise ValueError( - " > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model." - ) # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager if self.speaker_manager is not None: self.num_speakers = self.speaker_manager.num_speakers # set ultimate speaker embedding size - if config.use_speaker_embedding or config.use_d_vector_file: + if config.use_d_vector_file: self.embedded_speaker_dim = ( config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) + if self.speaker_manager is not None: + assert config.d_vector_dim == self.speaker_manager.d_vector_dim, " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: print(" > Init speaker_embedding layer.") @@ -184,12 +191,33 @@ def forward( self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value """ - Shapes: - - x: :math:`[B, T]` - - x_lenghts::math:`B` - - y: :math:`[B, T, C]` - - y_lengths::math:`B` - - g: :math:`[B, C] or B` + Args: + x (torch.Tensor): + Input text sequence ids. :math:`[B, T_en]` + + x_lengths (torch.Tensor): + Lengths of input text sequences. :math:`[B]` + + y (torch.Tensor): + Target mel-spectrogram frames. :math:`[B, T_de, C_mel]` + + y_lengths (torch.Tensor): + Lengths of target mel-spectrogram frames. :math:`[B]` + + aux_input (Dict): + Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model. + :math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model usind speaker-embedding + layer. 
:math:`B` + + Returns: + Dict: + - z: :math: `[B, T_de, C]` + - logdet: :math:`B` + - y_mean: :math:`[B, T_de, C]` + - y_log_scale: :math:`[B, T_de, C]` + - alignments: :math:`[B, T_en, T_de]` + - durations_log: :math:`[B, T_en, 1]` + - total_durations_log: :math:`[B, T_en, 1]` """ # [B, T, C] -> [B, C, T] y = y.transpose(1, 2) @@ -508,17 +536,18 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): """Initiate model from config Args: config (VitsConfig): Model config. samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. Defaults to None. + verbose (bool): If True, print init messages. Defaults to True. """ from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config) + ap = AudioProcessor.init_from_config(config, verbose) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) return GlowTTS(new_config, ap, tokenizer, speaker_manager) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 82d0ec3b78..e97b793a67 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -1,11 +1,13 @@ import copy import os import unittest +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch from torch import optim -from tests import get_tests_input_path +from tests import get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models.glow_tts import GlowTTS @@ -28,36 +30,211 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -class GlowTTSTrainTest(unittest.TestCase): - @staticmethod - def test_train_step(): +class TestGlowTTS(unittest.TestCase): + def _create_inputs(self): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids - criterion = GlowTTSLoss() + def _check_parameter_changes(self, model, model_ref): + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref + ) + count += 1 - # model to train + def test_init_multispeaker(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config) + # speaker embedding with default speaker_embedding_dim + config.use_speaker_embedding = True + config.num_speakers = 5 + config.d_vector_dim = None + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, model.hidden_channels_enc) + # use external speaker embeddings with speaker_embedding_dim = 301 + config = GlowTTSConfig(num_chars=32) + config.use_d_vector_file = True + config.d_vector_dim = 301 + model = GlowTTS(config) + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, 301) + # use speaker embedddings by the provided speaker_manager + config = GlowTTSConfig(num_chars=32) + config.use_speaker_embedding = True + config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech", "speakers.json") + speaker_manager = SpeakerManager.init_from_config(config) + model = GlowTTS(config) + model.speaker_manager = speaker_manager + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, model.hidden_channels_enc) + self.assertEqual(model.num_speakers, speaker_manager.num_speakers) + # use external speaker embeddings by the provided speaker_manager + config = GlowTTSConfig(num_chars=32) + config.use_d_vector_file = True + config.d_vector_dim = 256 + config.d_vector_file = os.path.join(get_tests_data_path(), "dummy_speakers.json") + speaker_manager = SpeakerManager.init_from_config(config) + model = GlowTTS(config) + model.speaker_manager = speaker_manager + model.init_multispeaker(config) + self.assertEqual(model.c_in_channels, speaker_manager.d_vector_dim) + self.assertEqual(model.num_speakers, speaker_manager.num_speakers) + + def test_unlock_act_norm_layers(self): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) + model.unlock_act_norm_layers() + for f in model.decoder.flows: + if getattr(f, "set_ddi", False): + self.assertFalse(f.initialized) - # reference model to compare model weights - model_ref = GlowTTS(config).to(device) + def test_lock_act_norm_layers(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.lock_act_norm_layers() + for f in model.decoder.flows: + if getattr(f, "set_ddi", False): + self.assertTrue(f.initialized) + + def test_forward(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + # create model + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + + def test_forward_with_d_vector(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + d_vector = torch.rand(8, 256).to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + 
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + def test_forward_with_speaker_id(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_speaker_embedding=True, + num_speakers=24, + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + # inference encoder and decoder with MAS + y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) + self.assertEqual(y["z"].shape, mel_spec.shape) + self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["y_mean"].shape, mel_spec.shape) + self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) + self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + + def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): + output_shape = outputs["model_outputs"].shape + self.assertEqual(outputs["model_outputs"].shape[::2] , mel_spec.shape[::2]) + self.assertEqual(outputs["logdet"], None) + self.assertEqual(outputs["y_mean"].shape, output_shape) + self.assertEqual(outputs["y_log_scale"].shape, output_shape) + self.assertEqual(outputs["alignments"].shape, output_shape[:2] + (input_dummy.shape[1],)) + self.assertEqual(outputs["durations_log"].shape, input_dummy.shape + (1,)) + self.assertEqual(outputs["total_durations_log"].shape, input_dummy.shape + (1,)) + + def test_inference(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.eval() + outputs = model.inference(input_dummy, {"x_lengths": input_lengths}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_d_vector(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + d_vector = torch.rand(8, 256).to(device) + config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.eval() + outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_speaker_ids(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = 
self._create_inputs() + speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + # create model + config = GlowTTSConfig( + num_chars=32, + use_speaker_embedding=True, + num_speakers=24, + ) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) + self._assert_inference_outputs(outputs, input_dummy, mel_spec) + + def test_inference_with_MAS(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + # create model + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + model.eval() + # inference encoder and decoder with MAS + y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) + y2 = model.decoder_inference(mel_spec, mel_lengths) + assert ( + y2["model_outputs"].shape == y["model_outputs"].shape + ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) + def test_train_step(self): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + criterion = GlowTTSLoss() + # model to train + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) + # reference model to compare model weights + model_ref = GlowTTS(config).to(device) + model.train() + print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) - count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) for _ in range(5): optimizer.zero_grad() @@ -75,40 +252,78 @@ def test_train_step(): loss = loss_dict["loss"] loss.backward() optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 + self._check_parameter_changes(model, model_ref) + + def test_train_eval_log(self): + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs() + batch = {} + batch["text_input"] = input_dummy + batch["text_lengths"] = input_lengths + batch["mel_lengths"] = mel_lengths + batch["mel_input"] = mel_spec + batch["d_vectors"] = None + batch["speaker_ids"] = None + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.train() + logger = TensorboardLogger(log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name = "glow_tts_test_train_log") + criterion = model.get_criterion() + outputs, _ = model.train_step(batch, criterion) + model.train_log(batch, outputs, logger, None, 1) + model.eval_log(batch, outputs, logger, None, 1) + logger.finish() + def test_test_run(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.eval() + test_figures, test_audios = model.test_run(None) + self.assertTrue(test_figures is not None) + self.assertTrue(test_audios is not None) -class GlowTTSInferenceTest(unittest.TestCase): - @staticmethod - def test_inference(): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) - input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_lengths = torch.randint(20, 30, (8,)).long().to(device) - speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + def test_load_checkpoint(self): + chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + chkp = {} + chkp["model"] = model.state_dict() + torch.save(chkp, chkp_path) + model.load_checkpoint(config, chkp_path) + self.assertTrue(model.training) + model.load_checkpoint(config, chkp_path, eval=True) + self.assertFalse(model.training) - # create model + def test_get_criterion(self): config = GlowTTSConfig(num_chars=32) - model = GlowTTS(config).to(device) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + criterion = model.get_criterion() + self.assertTrue(criterion is not None) - model.eval() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + def test_init_from_config(self): + config = GlowTTSConfig(num_chars=32) + model = GlowTTS.init_from_config(config, verbose=False).to(device) - # inference encoder and decoder with MAS - y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) + config = GlowTTSConfig(num_chars=32, num_speakers=2) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 2) + self.assertTrue(not hasattr(model, "emb_g")) - y2 = model.decoder_inference(mel_spec, mel_lengths) + config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 2) + self.assertTrue(hasattr(model, "emb_g")) + + config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True, speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + 
self.assertTrue(model.num_speakers == 10) + self.assertTrue(hasattr(model, "emb_g")) + + config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + model = GlowTTS.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 1) + self.assertTrue(not hasattr(model, "emb_g")) + self.assertTrue(model.c_in_channels == config.d_vector_dim) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape - ) From 5a1d2dedca72248d0b5d75d97172ab87a22b6f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:09 +0000 Subject: [PATCH 40/67] Add verbose option to AudioProcessor --- TTS/utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 55ce49b508..e92acf574e 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -380,10 +380,10 @@ def __init__( self.symmetric_norm = None @staticmethod - def init_from_config(config: "Coqpit"): + def init_from_config(config: "Coqpit", verbose=True): if "audio" in config: - return AudioProcessor(**config.audio) - return AudioProcessor(**config) + return AudioProcessor(verbose=verbose, **config.audio) + return AudioProcessor(verbose=verbose, **config) ### setting up the parameters ### def _build_mel_basis( From 3c9e5188a23d95b4ebdcb6001b9f5e836ada0aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:25 +0000 Subject: [PATCH 41/67] Fix tokenizer init_from_config --- TTS/tts/utils/text/tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 3f416bbb86..f84a51eed8 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -146,8 +146,9 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): the config values. Defaults to None. 
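+
+        Example (an illustrative sketch; assumes a config that carries the default character-set
+        and cleaner fields):
+
+            >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
+            >>> tokenizer, new_config = TTSTokenizer.init_from_config(GlowTTSConfig())
+            >>> token_ids = tokenizer.text_to_ids("Hello world!")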
""" # init cleaners + text_cleaner = None if isinstance(config.text_cleaner, (str, list)): - text_cleaner = getattr(cleaners, config.text_cleaner) + text_cleaner = getattr(config, "text_cleaner") # init characters if characters is None: From 9d9a5b33a082badb7f9a1e788031322332efa339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:36:46 +0000 Subject: [PATCH 42/67] Fix glow_tts_config missing field --- TTS/tts/configs/glow_tts_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index ce8eee6dfa..f42f3e5a51 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -153,6 +153,7 @@ class GlowTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False + speakers_file: str = None use_d_vector_file: bool = False d_vector_file: str = False From 79a5400e0ab894319bc0550e8ef900cd81f04544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 11:37:02 +0000 Subject: [PATCH 43/67] Add get_tests_data_path --- tests/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/__init__.py b/tests/__init__.py index 0a0c3379c3..8906c8c796 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -26,6 +26,11 @@ def get_tests_input_path(): return os.path.join(get_tests_path(), "inputs") +def get_tests_data_path(): + """Returns the path to the test data directory.""" + return os.path.join(get_tests_path(), "data") + + def get_tests_output_path(): """Returns the path to the directory for test outputs.""" return os.path.join(get_tests_path(), "outputs") From 09195786ec67a309ee2c58c007cbe937310945b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 Jan 2022 14:30:53 +0000 Subject: [PATCH 44/67] Make lint --- TTS/bin/train_tts.py | 5 +---- TTS/tts/datasets/formatters.py | 3 +-- TTS/tts/models/glow_tts.py | 4 +++- TTS/tts/models/vits.py | 14 ++++++++---- TTS/tts/utils/synthesis.py | 4 ---- TTS/tts/utils/text/tokenizer.py | 2 +- TTS/utils/synthesizer.py | 9 ++------ tests/tts_tests/test_glow_tts.py | 38 +++++++++++++++++++++++--------- 8 files changed, 46 insertions(+), 33 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 9a4a430a20..6477e75b99 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,12 +1,9 @@ import os -from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config +from TTS.config import load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.languages import LanguageManager -from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.audio import AudioProcessor def main(): diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 68c07eaa11..8000e783b2 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -285,10 +285,9 @@ def brspeech(root_path, meta_file, ignored_speakers=None): return items -def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2"): +def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic2", ignored_speakers=None): """https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip""" file_ext = "flac" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", 
recursive=True)
     for meta_file in meta_files:
diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py
index 190c699e01..da7fca17c3 100644
--- a/TTS/tts/models/glow_tts.py
+++ b/TTS/tts/models/glow_tts.py
@@ -123,7 +123,9 @@ def init_multispeaker(self, config: Coqpit):
                 config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
             )
             if self.speaker_manager is not None:
-                assert config.d_vector_dim == self.speaker_manager.d_vector_dim, " [!] d-vector dimension mismatch b/w config and speaker manager."
+                assert (
+                    config.d_vector_dim == self.speaker_manager.d_vector_dim
+                ), " [!] d-vector dimension mismatch b/w config and speaker manager."
         # init speaker embedding layer
         if config.use_speaker_embedding and not config.use_d_vector_file:
             print(" > Init speaker_embedding layer.")
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 83d2f9f92d..df8abd8e16 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,5 +1,4 @@
 import math
-import random
 from dataclasses import dataclass, field, replace
 from itertools import chain
 from typing import Dict, List, Tuple, Union
@@ -266,10 +265,20 @@ class Vits(BaseTTS):
     Check :class:`TTS.tts.configs.vits_config.VitsConfig` for class arguments.

     Examples:
+        Init only the model layers.
+
         >>> from TTS.tts.configs.vits_config import VitsConfig
         >>> from TTS.tts.models.vits import Vits
         >>> config = VitsConfig()
         >>> model = Vits(config)
+
+        Fully initialize a model ready for use. All the class attributes and class members
+        (e.g. Tokenizer, AudioProcessor, etc.) are initialized internally based on config values.
+
+        >>> from TTS.tts.configs.vits_config import VitsConfig
+        >>> from TTS.tts.models.vits import Vits
+        >>> config = VitsConfig()
+        >>> model = Vits.init_from_config(config)
     """

     # pylint: disable=dangerous-default-value
@@ -903,13 +912,10 @@ def test_run(self, assets) -> Tuple[Dict, Dict]:
                 aux_inputs["text"],
                 self.config,
                 "cuda" in str(next(self.parameters()).device),
-                self.ap,
                 speaker_id=aux_inputs["speaker_id"],
                 d_vector=aux_inputs["d_vector"],
                 style_wav=aux_inputs["style_wav"],
                 language_id=aux_inputs["language_id"],
-                language_name=aux_inputs["language_name"],
-                enable_eos_bos_chars=self.config.enable_eos_bos_chars,
                 use_griffin_lim=True,
                 do_trim_silence=False,
             ).values()
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 72cd8403a8..6fed838205 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -181,7 +181,6 @@ def synthesis(
     do_trim_silence=False,
     d_vector=None,
     language_id=None,
-    language_name=None,
     backend="torch",
 ):
     """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
@@ -218,9 +217,6 @@ def synthesis(
         language_id (int):
            Language ID passed to the language embedding layer in a multi-lingual model. Defaults to None.

-        language_name (str):
-            Language name corresponding to the language code used by the phonemizer. Defaults to None.
-
         backend (str):
             tf or torch. Defaults to "torch".
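+
+        Example (a minimal sketch; assumes `model` is an already-initialized TTS model such as
+        one returned by `GlowTTS.init_from_config`):
+
+            >>> outputs = synthesis(model, text="Hello world!", CONFIG=model.config, use_cuda=False, use_griffin_lim=True)
+            >>> wav = outputs["wav"]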
""" diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f84a51eed8..80be368d48 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -148,7 +148,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): # init cleaners text_cleaner = None if isinstance(config.text_cleaner, (str, list)): - text_cleaner = getattr(config, "text_cleaner") + text_cleaner = getattr(cleaners, config.text_cleaner) # init characters if characters is None: diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index f6a1ae6ab1..a1a323e819 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,13 +122,9 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) - speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: - self.tts_model = setup_tts_model( - config=self.tts_config, - speaker_manager=speaker_manager, - language_manager=language_manager, - ) + self.tts_model = setup_tts_model(config=self.tts_config) else: - self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) + self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -333,7 +329,6 @@ def tts( use_cuda=self.use_cuda, speaker_id=speaker_id, language_id=language_id, - language_name=language_name, style_wav=style_wav, use_griffin_lim=use_gl, d_vector=speaker_embedding, diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index e97b793a67..e48977e9d4 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -1,8 +1,6 @@ import copy import os import unittest -from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch from torch import optim @@ -11,7 +9,9 @@ from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models.glow_tts import GlowTTS +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor +from TTS.utils.logging.tensorboard_logger import TensorboardLogger # pylint: disable=unused-variable @@ -31,7 +31,8 @@ def count_parameters(model): class TestGlowTTS(unittest.TestCase): - def _create_inputs(self): + @staticmethod + def _create_inputs(): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 @@ -40,7 +41,8 @@ def _create_inputs(self): speaker_ids = torch.randint(0, 5, (8,)).long().to(device) return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids - def _check_parameter_changes(self, model, model_ref): + @staticmethod + def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( @@ -166,7 +168,7 @@ def test_forward_with_speaker_id(self): def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): output_shape = outputs["model_outputs"].shape - self.assertEqual(outputs["model_outputs"].shape[::2] , mel_spec.shape[::2]) + self.assertEqual(outputs["model_outputs"].shape[::2], mel_spec.shape[::2]) self.assertEqual(outputs["logdet"], None) self.assertEqual(outputs["y_mean"].shape, output_shape) self.assertEqual(outputs["y_log_scale"].shape, output_shape) @@ -185,7 +187,12 @@ def test_inference(self): def test_inference_with_d_vector(self): input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() d_vector = torch.rand(8, 256).to(device) - config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) model.eval() outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) @@ -268,7 +275,9 @@ def test_train_eval_log(self): model = GlowTTS.init_from_config(config, verbose=False).to(device) model.run_data_dep_init = False model.train() - logger = TensorboardLogger(log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name = "glow_tts_test_train_log") + logger = TensorboardLogger( + log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"), model_name="glow_tts_test_train_log" + ) criterion = model.get_criterion() outputs, _ = model.train_step(batch, criterion) model.train_log(batch, outputs, logger, None, 1) @@ -316,14 +325,23 @@ def test_init_from_config(self): self.assertTrue(model.num_speakers == 2) self.assertTrue(hasattr(model, "emb_g")) - config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True, speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")) + config = GlowTTSConfig( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 10) self.assertTrue(hasattr(model, "emb_g")) - config = GlowTTSConfig(num_chars=32, use_d_vector_file=True, d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json")) + config = GlowTTSConfig( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) model = GlowTTS.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) self.assertTrue(model.c_in_channels == config.d_vector_dim) - From 26be609cfaaa83250100b62001c9efe9d05ccb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Jan 2022 17:39:06 +0000 Subject: [PATCH 45/67] Extend unittests --- TTS/tts/layers/vits/networks.py | 9 +- TTS/tts/models/vits.py | 46 ++++- tests/tts_tests/test_glow_tts.py | 89 ++++++---- tests/tts_tests/test_vits.py | 285 +++++++++++++++++++++++++++---- 4 files changed, 361 insertions(+), 68 deletions(-) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index ef426ace5c..b6497c78fa 100644 --- a/TTS/tts/layers/vits/networks.py +++ 
b/TTS/tts/layers/vits/networks.py @@ -83,6 +83,7 @@ def forward(self, x, x_lengths, lang_emb=None): - x: :math:`[B, T]` - x_length: :math:`[B]` """ + assert x.shape[0] == x_lengths.shape[0] x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] # concat the lang emb in embedding chars @@ -90,7 +91,7 @@ def forward(self, x, x_lengths, lang_emb=None): x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1) x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) # [b, 1, t] x = self.encoder(x * x_mask, x_mask) stats = self.proj(x) * x_mask @@ -136,6 +137,9 @@ def __init__( def forward(self, x, x_mask, g=None, reverse=False): """ + Note: + Set `reverse` to True for inference. + Shapes: - x: :math:`[B, C, T]` - x_mask: :math:`[B, 1, T]` @@ -209,6 +213,9 @@ def __init__( def forward(self, x, x_mask, g=None, reverse=False): """ + Note: + Set `reverse` to True for inference. + Shapes: - x: :math:`[B, C, T]` - x_mask: :math:`[B, 1, T]` diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index df8abd8e16..34cb69c8a7 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -568,6 +568,19 @@ def forward( - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` - language_ids: :math:`[B]` + + Return Shapes: + - model_outputs: :math:`[B, 1, T_wav]` + - alignments: :math:`[B, T_seq, T_dec]` + - z: :math:`[B, C, T_dec]` + - z_p: :math:`[B, C, T_dec]` + - m_p: :math:`[B, C, T_dec]` + - logs_p: :math:`[B, C, T_dec]` + - m_q: :math:`[B, C, T_dec]` + - logs_q: :math:`[B, C, T_dec]` + - waveform_seg: :math:`[B, 1, spec_seg_size * hop_length]` + - gt_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` + - syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) @@ -668,15 +681,33 @@ def forward( ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): + @staticmethod + def _set_x_lengths(x, aux_input): + if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: + return aux_input["x_lengths"] + return torch.tensor(x.shape[1:2]).to(x.device) + + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None}): """ + Note: + To run in batch mode, provide `x_lengths` else model assumes that the batch size is 1. 
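+
+        Example (an illustrative sketch; `model` is an initialized `Vits` and the token ids are made up):
+
+            >>> import torch
+            >>> x = torch.randint(0, 24, (2, 50))   # two padded token-id sequences
+            >>> x_lengths = torch.tensor([50, 30])  # true length of each sequence
+            >>> outputs = model.inference(x, aux_input={"x_lengths": x_lengths})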
+ Shapes: - x: :math:`[B, T_seq]` - - d_vectors: :math:`[B, C, 1]` + - x_lengths: :math:`[B]` + - d_vectors: :math:`[B, C]` - speaker_ids: :math:`[B]` + + Return Shapes: + - model_outputs: :math:`[B, 1, T_wav]` + - alignments: :math:`[B, T_seq, T_dec]` + - z: :math:`[B, C, T_dec]` + - z_p: :math:`[B, C, T_dec]` + - m_p: :math:`[B, C, T_dec]` + - logs_p: :math:`[B, C, T_dec]` """ sid, g, lid = self._set_cond_input(aux_input) - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = self._set_x_lengths(x, aux_input) # speaker embedding if self.args.use_speaker_embedding and sid is not None: @@ -699,8 +730,9 @@ def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "langu w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype) - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype).unsqueeze(1) # [B, 1, T_dec] + + attn_mask = x_mask * y_mask.transpose(1, 2) # [B, 1, T_enc] * [B, T_dec, 1] attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1).transpose(1, 2)) m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2) @@ -999,7 +1031,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): """Initiate model from config Args: @@ -1009,7 +1041,7 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] """ from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config) + ap = AudioProcessor.init_from_config(config, verbose=verbose) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) language_manager = LanguageManager.init_from_config(config) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index e48977e9d4..305f86b896 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -23,6 +23,7 @@ ap = AudioProcessor(**c.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") +BATCH_SIZE = 3 def count_parameters(model): @@ -32,13 +33,13 @@ def count_parameters(model): class TestGlowTTS(unittest.TestCase): @staticmethod - def _create_inputs(): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) + def _create_inputs(batch_size=8): + input_dummy = torch.randint(0, 24, (batch_size, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (batch_size,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_lengths = torch.randint(20, 30, (8,)).long().to(device) - speaker_ids = torch.randint(0, 5, (8,)).long().to(device) + mel_spec = torch.rand(batch_size, 30, c.audio["num_mels"]).to(device) + mel_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) + speaker_ids = torch.randint(0, 5, (batch_size,)).long().to(device) return input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids @staticmethod @@ -104,8 +105,8 @@ def test_lock_act_norm_layers(self): if getattr(f, "set_ddi", False): self.assertTrue(f.initialized) - def test_forward(self): - input_dummy, input_lengths, mel_spec, 
mel_lengths, speaker_ids = self._create_inputs() + def _test_forward(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) # create model config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) @@ -114,16 +115,20 @@ def test_forward(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_forward_with_d_vector(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - d_vector = torch.rand(8, 256).to(device) + def test_forward(self): + self._test_forward(1) + self._test_forward(3) + + def _test_forward_with_d_vector(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + d_vector = torch.rand(batch_size, 256).to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -137,16 +142,20 @@ def test_forward_with_d_vector(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_forward_with_speaker_id(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + def test_forward_with_d_vector(self): + self._test_forward_with_d_vector(1) + self._test_forward_with_d_vector(3) + + def _test_forward_with_speaker_id(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + speaker_ids = torch.randint(0, 24, (batch_size,)).long().to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -159,13 +168,17 @@ def test_forward_with_speaker_id(self): # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) - self.assertEqual(y["logdet"].shape, torch.Size([8])) + self.assertEqual(y["logdet"].shape, torch.Size([batch_size])) self.assertEqual(y["y_mean"].shape, mel_spec.shape) self.assertEqual(y["y_log_scale"].shape, mel_spec.shape) self.assertEqual(y["alignments"].shape, mel_spec.shape[:2] + (input_dummy.shape[1],)) self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1,)) + def test_forward_with_speaker_id(self): + self._test_forward_with_speaker_id(1) + 
self._test_forward_with_speaker_id(3) + def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): output_shape = outputs["model_outputs"].shape self.assertEqual(outputs["model_outputs"].shape[::2], mel_spec.shape[::2]) @@ -176,17 +189,21 @@ def _assert_inference_outputs(self, outputs, input_dummy, mel_spec): self.assertEqual(outputs["durations_log"].shape, input_dummy.shape + (1,)) self.assertEqual(outputs["total_durations_log"].shape, input_dummy.shape + (1,)) - def test_inference(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + def _test_inference(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.eval() outputs = model.inference(input_dummy, {"x_lengths": input_lengths}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_d_vector(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - d_vector = torch.rand(8, 256).to(device) + def test_inference(self): + self._test_inference(1) + self._test_inference(3) + + def _test_inference_with_d_vector(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + d_vector = torch.rand(batch_size, 256).to(device) config = GlowTTSConfig( num_chars=32, use_d_vector_file=True, @@ -198,9 +215,13 @@ def test_inference_with_d_vector(self): outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_speaker_ids(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() - speaker_ids = torch.randint(0, 24, (8,)).long().to(device) + def test_inference_with_d_vector(self): + self._test_inference_with_d_vector(1) + self._test_inference_with_d_vector(3) + + def _test_inference_with_speaker_ids(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) + speaker_ids = torch.randint(0, 24, (batch_size,)).long().to(device) # create model config = GlowTTSConfig( num_chars=32, @@ -211,8 +232,12 @@ def test_inference_with_speaker_ids(self): outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) - def test_inference_with_MAS(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + def test_inference_with_speaker_ids(self): + self._test_inference_with_speaker_ids(1) + self._test_inference_with_speaker_ids(3) + + def _test_inference_with_MAS(self, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) # create model config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) @@ -226,8 +251,13 @@ def test_inference_with_MAS(self): y["model_outputs"].shape, y2["model_outputs"].shape ) + def test_inference_with_MAS(self): + self._test_inference_with_MAS(1) + self._test_inference_with_MAS(3) + def test_train_step(self): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs() + batch_size = BATCH_SIZE + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(batch_size) criterion = GlowTTSLoss() # model to train config = GlowTTSConfig(num_chars=32) @@ 
-263,7 +293,8 @@ def test_train_step(self): self._check_parameter_changes(model, model_ref) def test_train_eval_log(self): - input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs() + batch_size = BATCH_SIZE + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(batch_size) batch = {} batch["text_input"] = input_dummy batch["text_lengths"] = input_lengths diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 4274d9479b..53e7c09e98 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -1,9 +1,11 @@ +import copy import os import unittest +from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch -from tests import assertHasAttr, assertHasNotAttr, get_tests_input_path +from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.tts.configs.vits_config import VitsConfig @@ -100,35 +102,35 @@ def test_voice_conversion(self): self.assertEqual(z_p.shape, (1, args.hidden_channels, spec_len)) self.assertEqual(z_hat.shape, (1, args.hidden_channels, spec_len)) - def _init_inputs(self, config): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8,)).long().to(device) + def _create_inputs(self, config, batch_size=2): + input_dummy = torch.randint(0, 24, (batch_size, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (batch_size,)).long().to(device) input_lengths[-1] = 128 - spec = torch.rand(8, config.audio["fft_size"] // 2 + 1, 30).to(device) - spec_lengths = torch.randint(20, 30, (8,)).long().to(device) + spec = torch.rand(batch_size, config.audio["fft_size"] // 2 + 1, 30).to(device) + spec_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) spec_lengths[-1] = spec.size(2) - waveform = torch.rand(8, 1, spec.size(2) * config.audio["hop_length"]).to(device) + waveform = torch.rand(batch_size, 1, spec.size(2) * config.audio["hop_length"]).to(device) return input_dummy, input_lengths, spec, spec_lengths, waveform - def _check_forward_outputs(self, config, output_dict, encoder_config=None): + def _check_forward_outputs(self, config, output_dict, encoder_config=None, batch_size=2): self.assertEqual( output_dict["model_outputs"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] ) - self.assertEqual(output_dict["alignments"].shape, (8, 128, 30)) + self.assertEqual(output_dict["alignments"].shape, (batch_size, 128, 30)) self.assertEqual(output_dict["alignments"].max(), 1) self.assertEqual(output_dict["alignments"].min(), 0) - self.assertEqual(output_dict["z"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["z_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["m_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["logs_p"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["m_q"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict["logs_q"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["z"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["z_p"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_p"].shape, (batch_size, 
config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_p"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_q"].shape, (batch_size, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_q"].shape, (batch_size, config.model_args.hidden_channels, 30)) self.assertEqual( output_dict["waveform_seg"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] ) if encoder_config: - self.assertEqual(output_dict["gt_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) - self.assertEqual(output_dict["syn_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["gt_spk_emb"].shape, (batch_size, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["syn_spk_emb"].shape, (batch_size, encoder_config.model_params["proj_dim"])) else: self.assertEqual(output_dict["gt_spk_emb"], None) self.assertEqual(output_dict["syn_spk_emb"], None) @@ -137,7 +139,7 @@ def test_forward(self): num_speakers = 0 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) config.model_args.spec_segment_size = 10 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config) model = Vits(config).to(device) output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform) self._check_forward_outputs(config, output_dict) @@ -148,7 +150,7 @@ def test_multispeaker_forward(self): config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) config.model_args.spec_segment_size = 10 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config) speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) model = Vits(config).to(device) @@ -157,16 +159,36 @@ def test_multispeaker_forward(self): ) self._check_forward_outputs(config, output_dict) + def test_d_vector_forward(self): + batch_size = 2 + args = VitsArgs( + spec_segment_size=10, + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + config = VitsConfig(model_args=args) + model = Vits.init_from_config(config, verbose=False).to(device) + model.train() + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + d_vectors = torch.randn(batch_size, 256).to(device) + output_dict = model.forward( + input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"d_vectors": d_vectors} + ) + self._check_forward_outputs(config, output_dict) + def test_multilingual_forward(self): num_speakers = 10 num_langs = 3 + batch_size = 2 args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) - speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) - lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) 
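+        # NOTE: both conditioning signals are plain LongTensors of shape [B]; the model looks
+        # speaker_ids and lang_ids up in its internal embedding tables during the forward pass.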
model = Vits(config).to(device) output_dict = model.forward( @@ -182,6 +204,7 @@ def test_multilingual_forward(self): def test_secl_forward(self): num_speakers = 10 num_langs = 3 + batch_size = 2 speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) speaker_encoder_config.model_params["use_torch_spec"] = True @@ -198,9 +221,9 @@ def test_secl_forward(self): config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) config.audio.sample_rate = 16000 - input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) - speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) - lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + input_dummy, input_lengths, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) model = Vits(config, speaker_manager=speaker_manager).to(device) output_dict = model.forward( @@ -213,28 +236,228 @@ def test_secl_forward(self): ) self._check_forward_outputs(config, output_dict, speaker_encoder_config) + def _check_inference_outputs(self, config, outputs, input_dummy, batch_size=1): + feat_len = outputs["z"].shape[2] + self.assertEqual(outputs["model_outputs"].shape[:2], (batch_size, 1)) # we don't know the channel dimension + self.assertEqual(outputs["alignments"].shape, (batch_size, input_dummy.shape[1], feat_len)) + self.assertEqual(outputs["z"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["z_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["m_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + self.assertEqual(outputs["logs_p"].shape, (batch_size, config.model_args.hidden_channels, feat_len)) + def test_inference(self): num_speakers = 0 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) - input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) model = Vits(config).to(device) - _ = model.inference(input_dummy) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + outputs = model.inference(input_dummy) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + outputs = model.inference(input_dummy, aux_input={"x_lengths": input_lengths}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) def test_multispeaker_inference(self): num_speakers = 10 config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) - input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) - speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) model = Vits(config).to(device) - _ = model.inference(input_dummy, {"speaker_ids": speaker_ids}) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + outputs = model.inference(input_dummy, {"speaker_ids": speaker_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + outputs = 
model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) def test_multilingual_inference(self): num_speakers = 10 num_langs = 3 args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) + model = Vits(config).to(device) + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) lang_ids = torch.randint(0, num_langs, (1,)).long().to(device) - model = Vits(config).to(device) _ = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) + + batch_size = 1 + input_dummy, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) + outputs = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + batch_size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config, batch_size=batch_size) + speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device) + outputs = model.inference( + input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids, "language_ids": lang_ids} + ) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=batch_size) + + def test_d_vector_inference(self): + args = VitsArgs( + spec_segment_size=10, + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + config = VitsConfig(model_args=args) + model = Vits.init_from_config(config, verbose=False).to(device) + model.eval() + # batch size = 1 + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) + d_vectors = torch.randn(1, 256).to(device) + outputs = model.inference(input_dummy, aux_input={"d_vectors": d_vectors}) + self._check_inference_outputs(config, outputs, input_dummy) + # batch size = 2 + input_dummy, input_lengths, *_ = self._create_inputs(config) + d_vectors = torch.randn(2, 256).to(device) + outputs = model.inference(input_dummy, aux_input={"x_lengths": input_lengths, "d_vectors": d_vectors}) + self._check_inference_outputs(config, outputs, input_dummy, batch_size=2) + + @staticmethod + def _check_parameter_changes(model, model_ref): + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref + ) + count += 1 + + def _create_batch(self, config, batch_size): + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(config, batch_size) + batch = {} + batch["text_input"] = input_dummy + batch["text_lengths"] = input_lengths + batch["mel_lengths"] = mel_lengths + batch["linear_input"] = mel_spec.transpose(1, 2) + batch["waveform"] = torch.rand(batch_size, config.audio["sample_rate"] * 10, 1).to(device) + batch["d_vectors"] = None + batch["speaker_ids"] = None + batch["language_ids"] = None + return batch + + def test_train_step(self): + # setup the model + config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) + model = Vits(config).to(device) + # create a batch + batch = self._create_batch(config, 1) + # model to train + criterions = model.get_criterion() + criterions = [criterions[0].to(device), criterions[1].to(device)] + # reference model to compare model weights + model_ref = Vits(config).to(device) + model.train() + # pass the state to ref model + model_ref.load_state_dict(copy.deepcopy(model.state_dict())) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizers = model.get_optimizer() + for _ in range(5): + _, loss_dict = model.train_step(batch, criterions, 0) + loss = loss_dict["loss"] + loss.backward() + optimizers[0].step() + + _, loss_dict = model.train_step(batch, criterions, 1) + loss = loss_dict["loss"] + loss.backward() + optimizers[1].step() + # check parameter changes + self._check_parameter_changes(model, model_ref) + + def test_train_eval_log(self): + batch_size = 2 + config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) + model = Vits.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.train() + batch = self._create_batch(config, batch_size) + logger = TensorboardLogger( + log_dir=os.path.join(get_tests_output_path(), "dummy_vits_logs"), model_name="vits_test_train_log" + ) + criterion = model.get_criterion() + criterion = [criterion[0].to(device), criterion[1].to(device)] + outputs = [None] * 2 + outputs[0], _ = model.train_step(batch, criterion, 0) + outputs[1], _ = model.train_step(batch, criterion, 1) + model.train_log(batch, outputs, logger, None, 1) + + model.eval_log(batch, outputs, logger, None, 1) + logger.finish() + + def test_test_run(self): + config = VitsConfig(model_args=VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + model.run_data_dep_init = False + model.eval() + test_figures, test_audios = model.test_run(None) + self.assertTrue(test_figures is not None) + self.assertTrue(test_audios is not None) + + def test_load_checkpoint(self): + chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") + config = VitsConfig(VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + chkp = {} + chkp["model"] = model.state_dict() + torch.save(chkp, chkp_path) + model.load_checkpoint(config, chkp_path) + self.assertTrue(model.training) + model.load_checkpoint(config, chkp_path, eval=True) + self.assertFalse(model.training) + + def test_get_criterion(self): + config = VitsConfig(VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + criterion = model.get_criterion() + self.assertTrue(criterion is not None) + + def test_init_from_config(self): + config = 
VitsConfig(model_args=VitsArgs(num_chars=32)) + model = Vits.init_from_config(config, verbose=False).to(device) + + config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2)) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertTrue(not hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2, use_speaker_embedding=True)) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertEqual(model.num_speakers, 2) + self.assertTrue(hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + )) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertEqual(model.num_speakers, 10) + self.assertTrue(hasattr(model, "emb_g")) + + config = VitsConfig(model_args=VitsArgs( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + )) + model = Vits.init_from_config(config, verbose=False).to(device) + self.assertTrue(model.num_speakers == 1) + self.assertTrue(not hasattr(model, "emb_g")) + self.assertTrue(model.embedded_speaker_dim == config.d_vector_dim) From 4b612d71a392212bd31ac428b716873060c8950f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Jan 2022 17:43:05 +0000 Subject: [PATCH 46/67] Make lint --- tests/tts_tests/test_vits.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 53e7c09e98..eaa325b002 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -1,7 +1,6 @@ import copy import os import unittest -from TTS.utils.logging.tensorboard_logger import TensorboardLogger import torch @@ -11,6 +10,7 @@ from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.logging.tensorboard_logger import TensorboardLogger LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") @@ -337,7 +337,7 @@ def _check_parameter_changes(model, model_ref): count += 1 def _create_batch(self, config, batch_size): - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(config, batch_size) + input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(config, batch_size) batch = {} batch["text_input"] = input_dummy batch["text_lengths"] = input_lengths @@ -441,22 +441,26 @@ def test_init_from_config(self): self.assertEqual(model.num_speakers, 2) self.assertTrue(hasattr(model, "emb_g")) - config = VitsConfig(model_args=VitsArgs( - num_chars=32, - num_speakers=2, - use_speaker_embedding=True, - speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), - )) + config = VitsConfig( + model_args=VitsArgs( + num_chars=32, + num_speakers=2, + use_speaker_embedding=True, + speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), + ) + ) model = Vits.init_from_config(config, verbose=False).to(device) self.assertEqual(model.num_speakers, 10) self.assertTrue(hasattr(model, "emb_g")) - config = VitsConfig(model_args=VitsArgs( - num_chars=32, - use_d_vector_file=True, - d_vector_dim=256, - 
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), - )) + config = VitsConfig( + model_args=VitsArgs( + num_chars=32, + use_d_vector_file=True, + d_vector_dim=256, + d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), + ) + ) model = Vits.init_from_config(config, verbose=False).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) From 24336262f5093a76e0b2a2f0e1794bfcd40e5d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 14 Jan 2022 12:10:39 +0000 Subject: [PATCH 47/67] Fix tests --- TTS/bin/find_unique_phonemes.py | 8 +++++--- tests/aux_tests/test_find_unique_phonemes.py | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d3143ca324..e84c17de2f 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,14 +7,16 @@ from TTS.config import load_config from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone +from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut + + +phonemizer = Gruut(language="en-us") def compute_phonemes(item): try: text = item[0] - language = item[-1] - ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|") + ph = phonemizer.phonemize(text).split("|") except: return [] return list(set(ph)) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index fa0abe4b92..fa740ba361 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -39,7 +39,6 @@ def test_espeak_phonemes(): num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=True, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, @@ -64,7 +63,6 @@ def test_no_espeak_phonemes(): num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, From 911b2dbab95b5af640d6ebdf2131b8df6bc12e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 14 Jan 2022 12:10:54 +0000 Subject: [PATCH 48/67] Fix docstring --- TTS/tts/datasets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 40eed7e365..07f3d99ce8 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -13,7 +13,7 @@ def split_dataset(items): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. + items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`. """ speakers = [item[-1] for item in items] is_multi_speaker = len(set(speakers)) > 1 @@ -52,7 +52,7 @@ def load_tts_samples( formatter (Callable, optional): The preprocessing function to be applied to create the list of samples. It must take the root_path and the meta_file name and return a list of samples in the format of - `[[audio_path, text, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as + `[[text, audio_path, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as example. Defaults to None. 
Returns: From 2472d43124f44ac6bed14072e673ba1f5d1921c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 Jan 2022 15:27:41 +0000 Subject: [PATCH 49/67] Allow padding for shorter segments --- TTS/tts/utils/helpers.py | 37 ++++++++++++++++++++++++++------- tests/tts_tests/test_helpers.py | 30 +++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index b0a010b0b1..3251337768 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -57,40 +57,61 @@ def sequence_mask(sequence_length, max_len=None): return mask -def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4): +def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_short=False): """Segment each sample in a batch based on the provided segment indices Args: x (torch.tensor): Input tensor. segment_indices (torch.tensor): Segment indices. segment_size (int): Expected output segment size. + pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size. """ + # pad the input tensor if it is shorter than the segment size + if pad_short and x.shape[-1] < segment_size: + x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) + segments = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): index_start = segment_indices[i] index_end = index_start + segment_size - segments[i] = x[i, :, index_start:index_end] + x_i = x[i] + if pad_short and index_end > x.size(2): + # pad the sample if it is shorter than the segment size + x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) + segments[i] = x_i[:, index_start:index_end] return segments -def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4): +def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False): """Create random segments based on the input lengths. Args: x (torch.tensor): Input tensor. x_lengths (torch.tensor): Input lengths. segment_size (int): Expected output segment size. + let_short_samples (bool): Allow shorter samples than the segment size. + pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size. Shapes: - x: :math:`[B, C, T]` - x_lengths: :math:`[B]` """ + _x_lenghts = x_lengths.clone() B, _, T = x.size() - if x_lengths is None: - x_lengths = T - max_idxs = x_lengths - segment_size + 1 - assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size." - segment_indices = (torch.rand([B]).type_as(x) * max_idxs).long() + if pad_short: + if T < segment_size: + x = torch.nn.functional.pad(x, (0, segment_size - T)) + T = segment_size + if _x_lenghts is None: + _x_lenghts = T + len_diff = _x_lenghts - segment_size + 1 + if let_short_samples: + _x_lenghts[len_diff < 0] = segment_size + len_diff = _x_lenghts - segment_size + 1 + else: + assert all(len_diff > 0), f" [!] At least one sample is shorter than the segment size ({segment_size}). 
\n {_x_lenghts}"
+ segment_indices = (torch.rand([B]).type_as(x) * len_diff).long()
 ret = segment(x, segment_indices, segment_size)
 return ret, segment_indices
diff --git a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py
index 6a2f260d28..708ecbf50e 100644
--- a/tests/tts_tests/test_helpers.py
+++ b/tests/tts_tests/test_helpers.py
@@ -1,6 +1,6 @@
 import torch as T

-from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask
+from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask, rand_segments


 def average_over_durations_test(): # pylint: disable=no-self-use
@@ -39,6 +39,34 @@ def segment_test():
 for idx, start_indx in enumerate(segment_ids):
 assert x[idx, :, start_indx : start_indx + 4].sum() == segments[idx, :, :].sum()

+ try:
+ segments = segment(x, segment_ids, segment_size=10)
+ raise Exception("Should have failed")
+ except:
+ pass
+
+ segments = segment(x, segment_ids, segment_size=10, pad_short=True)
+ for idx, start_indx in enumerate(segment_ids):
+ assert x[idx, :, start_indx : start_indx + 10].sum() == segments[idx, :, :].sum()
+
+
+def rand_segments_test():
+ x = T.rand(2, 3, 4)
+ x_lens = T.randint(3, 4, (2,))
+ segments, seg_idxs = rand_segments(x, x_lens, segment_size=3)
+ assert segments.shape == (2, 3, 3)
+ assert all(seg_idxs >= 0), seg_idxs
+ try:
+ segments, _ = rand_segments(x, x_lens, segment_size=5)
+ raise Exception("Should have failed")
+ except:
+ pass
+ x_lens_back = x_lens.clone()
+ segments, seg_idxs= rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
+ assert segments.shape == (2, 3, 5)
+ assert all(seg_idxs >= 0), seg_idxs
+ assert all(x_lens_back == x_lens)
+

 def generate_path_test():
 durations = T.randint(1, 4, (10, 21))

From 8c555d39070cbe869b41f86e6189febc8f01c7b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 21 Jan 2022 15:29:06 +0000
Subject: [PATCH 50/67] Implement `start_by_longest` option for TTSDataset

---
 TTS/tts/configs/shared_configs.py | 5 +++++
 TTS/tts/configs/vits_config.py | 13 +------------
 TTS/tts/datasets/dataset.py | 10 ++++++++++
 TTS/tts/models/base_tts.py | 1 +
 tests/data_tests/test_loader.py | 20 +++++++++++++++++++-
 5 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index c7958fda00..09266ce2e7 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -172,6 +172,10 @@ class BaseTTSConfig(BaseTrainingConfig):
 use_noise_augment (bool):
 Augment the input audio with random noise.

+ start_by_longest (bool):
+ If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
+ Defaults to False.
+
 add_blank (bool):
 Add blank characters between each other two characters. It improves performance for some models at expense of slower run-time due to the longer input sequence.
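The option documented above reduces to a single swap on the ascending sort order computed in `preprocess_samples` (see the `dataset.py` hunk later in this patch), so the longest sample lands in the very first batch. A minimal standalone sketch of that swap, assuming only `numpy` and a toy list of per-sample audio lengths:

import numpy as np

audio_lengths = np.array([120, 80, 300, 45, 200])  # frames per sample (toy values)
sorted_idxs = np.argsort(audio_lengths)            # ascending order: [3 1 0 4 2]

start_by_longest = True
if start_by_longest:
    # swap the longest sample (last index) to the front; an OOM triggered by
    # the longest batch then surfaces immediately instead of mid-training
    longest_idx = sorted_idxs[-1]
    sorted_idxs[-1] = sorted_idxs[0]
    sorted_idxs[0] = longest_idx

print(sorted_idxs)                                 # [2 1 0 4 3]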
@@ -224,6 +228,7 @@ class BaseTTSConfig(BaseTrainingConfig):
 compute_linear_spec: bool = False
 precompute_num_workers: int = 0
 use_noise_augment: bool = False
+ start_by_longest: bool = False
 # dataset
 datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
 # optimizer
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
index 36c948afd5..d306552df3 100644
--- a/TTS/tts/configs/vits_config.py
+++ b/TTS/tts/configs/vits_config.py
@@ -67,15 +67,6 @@ class VitsConfig(BaseTTSConfig):
 compute_linear_spec (bool):
 If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

- sort_by_audio_len (bool):
- If true, dataloder sorts the data by audio length else sorts by the input text length. Defaults to `True`.
-
- min_seq_len (int):
- Minimum sequnce length to be considered for training. Defaults to `0`.
-
- max_seq_len (int):
- Maximum sequnce length to be considered for training. Defaults to `500000`.
-
 r (int):
 Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.

@@ -123,6 +114,7 @@ class VitsConfig(BaseTTSConfig):
 feat_loss_alpha: float = 1.0
 mel_loss_alpha: float = 45.0
 dur_loss_alpha: float = 1.0
+ aligner_loss_alpha = 1.0
 speaker_encoder_loss_alpha: float = 1.0

 # data loader params
@@ -130,9 +122,6 @@ class VitsConfig(BaseTTSConfig):
 compute_linear_spec: bool = True

 # overrides
- sort_by_audio_len: bool = True
- min_seq_len: int = 0
- max_seq_len: int = 500000
 r: int = 1 # DO NOT CHANGE
 add_blank: bool = True
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 5fab71088d..99d9429937 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -56,6 +56,7 @@ def __init__(
 d_vector_mapping: Dict = None,
 language_id_mapping: Dict = None,
 use_noise_augment: bool = False,
+ start_by_longest: bool = False,
 verbose: bool = False,
 ):
 """Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
@@ -109,6 +110,8 @@ def __init__(
 use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.

+ start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
+
 verbose (bool): Print diagnostic information. Defaults to false.
 """
 super().__init__()
@@ -130,6 +133,7 @@ def __init__(
 self.d_vector_mapping = d_vector_mapping
 self.language_id_mapping = language_id_mapping
 self.use_noise_augment = use_noise_augment
+ self.start_by_longest = start_by_longest
 self.verbose = verbose
 self.rescue_item_idx = 1
@@ -316,6 +320,12 @@ def preprocess_samples(self):
 samples, audio_lengths, _ = self.select_samples_by_idx(keep_idx)

 sorted_idxs = self.sort_by_length(audio_lengths)
+
+ if self.start_by_longest:
+ longest_idxs = sorted_idxs[-1]
+ sorted_idxs[-1] = sorted_idxs[0]
+ sorted_idxs[0] = longest_idxs
+
 samples, audio_lengths, text_lengtsh = self.select_samples_by_idx(sorted_idxs)

 if len(samples) == 0:
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 9a6a56df76..7cdfa915df 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -290,6 +290,7 @@ def get_data_loader(
 speaker_id_mapping=speaker_id_mapping,
 d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
 tokenizer=self.tokenizer,
+ start_by_longest=config.start_by_longest,
 language_id_mapping=language_id_mapping,
 )

diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index f2f2a8d238..477ee71fb0 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -37,7 +37,7 @@ def __init__(self, *args, **kwargs):
 self.max_loader_iter = 4
 self.ap = AudioProcessor(**c.audio)

- def _create_dataloader(self, batch_size, r, bgs):
+ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
 items = ljspeech(c.data_path, "metadata.csv")
 tokenizer, _ = TTSTokenizer.init_from_config(c)
 dataset = TTSDataset(
@@ -52,6 +52,7 @@ def _create_dataloader(self, batch_size, r, bgs):
 max_text_len=c.max_text_len,
 min_audio_len=c.min_audio_len,
 max_audio_len=c.max_audio_len,
+ start_by_longest=start_by_longest
 )
 dataloader = DataLoader(
 dataset,
@@ -127,6 +128,23 @@ def test_batch_group_shuffle(self):
 self.assertGreaterEqual(avg_length, last_length)
 self.assertTrue(is_items_reordered)

+ def test_start_by_longest(self):
+ """Test start_by_longest option.
+
+ The first item of the first batch must be longer than all the other items.
+ """
+ if ok_ljspeech:
+ dataloader, _ = self._create_dataloader(2, c.r, 0, True)
+ dataloader.dataset.preprocess_samples()
+ for i, data in enumerate(dataloader):
+ if i == self.max_loader_iter:
+ break
+ mel_lengths = data["mel_lengths"]
+ if i == 0:
+ max_len = mel_lengths[0]
+ print(mel_lengths)
+ self.assertTrue(all(max_len >= mel_lengths))
+
 def test_padding_and_spectrograms(self):
 def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
 self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding

From c3ae11482d71572db0424112a78fce2b2e9c5d82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 21 Jan 2022 15:33:15 +0000
Subject: [PATCH 51/67] Refactor VITS model

---
 TTS/tts/models/vits.py | 106 ++++++++++++++++++++++++++---------------
 1 file changed, 68 insertions(+), 38 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 34cb69c8a7..301ddfcd6e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -39,7 +39,7 @@ class VitsArgs(Coqpit):
 Number of characters in the vocabulary. Defaults to 100.

 out_channels (int):
- Number of output channels. Defaults to 513.
+ Number of output channels of the decoder. Defaults to 513.

 spec_segment_size (int):
 Decoder input segment size. Defaults to 32 `(32 * hoplength = waveform length)`.
@@ -360,6 +360,8 @@ def __init__( language_emb_dim=self.embedded_language_dim, ) + upsample_rate = math.prod(self.args.upsample_rates_decoder) + assert upsample_rate == self.config.audio.hop_length, f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" self.waveform_decoder = HifiganGenerator( self.args.hidden_channels, 1, @@ -536,6 +538,54 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } + def _set_speaker_input(self, aux_input: Dict): + d_vectors = aux_input.get("d_vectors", None) + speaker_ids = aux_input.get("speaker_ids", None) + + if d_vectors is not None and speaker_ids is not None: + raise ValueError("[!] Cannot use d-vectors and speaker-ids together.") + + if speaker_ids is not None and not hasattr(self, "emb_g"): + raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.") + + g = speaker_ids if speaker_ids is not None else d_vectors + return g + + def forward_mas(self, outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g, lang_emb): + # find the alignment path + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) + with torch.no_grad(): + o_scale = torch.exp(-2 * logs_p) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) + logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) + logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + logp1 + logp4 + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() # [b, 1, t, t'] + + # duration predictor + attn_durations = attn.sum(3) + if self.args.use_sdp: + loss_duration = self.duration_predictor( + x.detach() if self.args.detach_dp_input else x, + x_mask, + attn_durations, + g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, + ) + loss_duration = loss_duration / torch.sum(x_mask) + else: + attn_log_durations = torch.log(attn_durations + 1e-6) * x_mask + log_durations = self.duration_predictor( + x.detach() if self.args.detach_dp_input else x, + x_mask, + g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, + ) + loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) + outputs["loss_duration"] = loss_duration + return outputs, attn + def forward( self, x: torch.tensor, @@ -601,51 +651,27 @@ def forward( # flow layers z_p = self.flow(z, y_mask, g=g) - # find the alignment path - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - with torch.no_grad(): - o_scale = torch.exp(-2 * logs_p) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) - logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp1 + logp4 - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - # duration predictor - attn_durations = attn.sum(3) - if self.args.use_sdp: - loss_duration = self.duration_predictor( - x.detach() if self.args.detach_dp_input else x, - x_mask, - attn_durations, - g=g.detach() if self.args.detach_dp_input 
and g is not None else g, - lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, - ) - loss_duration = loss_duration / torch.sum(x_mask) - else: - attn_log_durations = torch.log(attn_durations + 1e-6) * x_mask - log_durations = self.duration_predictor( - x.detach() if self.args.detach_dp_input else x, - x_mask, - g=g.detach() if self.args.detach_dp_input and g is not None else g, - lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, - ) - loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) - outputs["loss_duration"] = loss_duration + if self.args.use_mas: + outputs, attn = self.forward_mas(outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g=g, lang_emb=lang_emb) + elif self.args.use_aligner_network: + outputs, attn = self.forward_aligner(outputs, m_p, z_p, x_mask, y_mask, g=g, lang_emb=lang_emb) + outputs["x_lens"] = x_lengths + outputs["y_lens"] = y_lengths # expand prior m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) # select a random feature segment for the waveform decoder - z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size) + z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True) o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( waveform, slice_ids * self.config.audio.hop_length, self.args.spec_segment_size * self.config.audio.hop_length, + pad_short=True ) if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: @@ -667,11 +693,11 @@ def forward( outputs.update( { "model_outputs": o, - "alignments": attn.squeeze(1), - "z": z, - "z_p": z_p, + "alignments" : attn.squeeze(1), "m_p": m_p, "logs_p": logs_p, + "z": z, + "z_p": z_p, "m_q": m_q, "logs_q": logs_q, "waveform_seg": wav_seg, @@ -914,14 +940,18 @@ def train_log( Returns: Tuple[Dict, np.ndarray]: training plots and output waveform. """ - self._log(self.ap, batch, outputs, "train") + figures, audios = self._log(self.ap, batch, outputs, "train") + logger.eval_figures(steps, figures) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - return self._log(self.ap, batch, outputs, "eval") + figures, audios = self._log(self.ap, batch, outputs, "eval") + logger.eval_figures(steps, figures) + logger.eval_audios(steps, audios, self.ap.sample_rate) @torch.no_grad() def test_run(self, assets) -> Tuple[Dict, Dict]: From c94112f63353443b0f36131dd34e3538ec6fcf2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:22:35 +0000 Subject: [PATCH 52/67] Update GAN model --- TTS/vocoder/models/gan.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index f78d69b86e..7e03e94f2e 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -19,7 +19,7 @@ class GAN(BaseVocoder): - def __init__(self, config: Coqpit): + def __init__(self, config: Coqpit, ap: AudioProcessor=None): """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer. 
It also helps mixing and matching different generator and disciminator networks easily.
@@ -28,6 +28,7 @@ def __init__(self, config: Coqpit):

 Args:
 config (Coqpit): Model configuration.
+ ap (AudioProcessor): 🐸TTS AudioProcessor instance. Defaults to None.

 Examples:
 Initializing the GAN model with HifiGAN generator and discriminator.
@@ -41,6 +42,7 @@ def __init__(self, config: Coqpit):
 self.model_d = setup_discriminator(config)
 self.train_disc = False # if False, train only the generator.
 self.y_hat_g = None # the last generator prediction to be passed onto the discriminator
+ self.ap = ap

 def forward(self, x: torch.Tensor) -> torch.Tensor:
 """Run the generator's forward pass.
@@ -201,10 +203,9 @@ def train_log(
 self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
 ) -> Tuple[Dict, np.ndarray]:
 """Call `_log()` for training."""
- ap = assets["audio_processor"]
- figures, audios = self._log("eval", ap, batch, outputs)
+ figures, audios = self._log("eval", self.ap, batch, outputs)
 logger.eval_figures(steps, figures)
- logger.eval_audios(steps, audios, ap.sample_rate)
+ logger.eval_audios(steps, audios, self.ap.sample_rate)

 @torch.no_grad()
 def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
@@ -215,10 +216,9 @@ def eval_log(
 self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument
 ) -> Tuple[Dict, np.ndarray]:
 """Call `_log()` for evaluation."""
- ap = assets["audio_processor"]
- figures, audios = self._log("eval", ap, batch, outputs)
+ figures, audios = self._log("eval", self.ap, batch, outputs)
 logger.eval_figures(steps, figures)
- logger.eval_audios(steps, audios, ap.sample_rate)
+ logger.eval_audios(steps, audios, self.ap.sample_rate)

 def load_checkpoint(
 self,
@@ -330,12 +330,11 @@ def get_data_loader( # pylint: disable=no-self-use
 Returns:
 DataLoader: Torch dataloader.
 """
- ap = assets["audio_processor"]
 dataset = GANDataset(
- ap=ap,
+ ap=self.ap,
 items=data_items,
 seq_len=config.seq_len,
- hop_len=ap.hop_length,
+ hop_len=self.ap.hop_length,
 pad_short=config.pad_short,
 conv_pad=config.conv_pad,
 return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
@@ -363,5 +362,6 @@ def get_criterion(self):
 return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)]

 @staticmethod
- def init_from_config(config: Coqpit) -> "GAN":
- return GAN(config)
+ def init_from_config(config: Coqpit, verbose=True) -> "GAN":
+ ap = AudioProcessor.init_from_config(config, verbose=verbose)
+ return GAN(config, ap=ap)

From 2386d804f13997f2d1eb6866e03f6bd6b3a78e91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 25 Jan 2022 09:23:07 +0000
Subject: [PATCH 53/67] Take file extension as an argument

---
 TTS/vocoder/datasets/preprocess.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py
index d8cc350ad7..0f69b812fa 100644
--- a/TTS/vocoder/datasets/preprocess.py
+++ b/TTS/vocoder/datasets/preprocess.py
@@ -33,8 +33,8 @@ def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
 np.save(quant_path, quant)


-def find_wav_files(data_path):
- wav_paths = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True)
+def find_wav_files(data_path, file_ext="wav"):
+ wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True)
 return wav_paths


@@ -43,8 +43,9 @@ def find_feat_files(data_path):
 return feat_paths


-def load_wav_data(data_path, eval_split_size):
- wav_paths = find_wav_files(data_path)
+def load_wav_data(data_path, eval_split_size, file_ext="wav"):
+ wav_paths = find_wav_files(data_path, file_ext=file_ext)
+ assert len(wav_paths) > 0, f" [!] {data_path} is empty."
 np.random.seed(0)
 np.random.shuffle(wav_paths)
 return wav_paths[:eval_split_size], wav_paths[eval_split_size:]

From 269f8c6ee34834f221c0f8ea15874a822de09f1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 25 Jan 2022 09:25:32 +0000
Subject: [PATCH 54/67] Update synthesizer to use init_from_config

---
 TTS/utils/synthesizer.py | 52 ----------------------------------------
 1 file changed, 52 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index a1a323e819..ddc2a6a545 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -110,21 +110,12 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -
 use_cuda (bool): enable/disable CUDA use.
""" # pylint: disable=global-statement - self.tts_config = load_config(tts_config_path) self.use_phonemes = self.tts_config.use_phonemes self.tts_model = setup_tts_model(config=self.tts_config) - speaker_manager = self._init_speaker_manager() - language_manager = self._init_language_manager() if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() - speaker_manager = self._init_speaker_encoder(speaker_manager) - - if language_manager is not None: - self.tts_model = setup_tts_model(config=self.tts_config) - else: - self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -157,49 +148,6 @@ def _is_use_d_vector_file(self): use_d_vector_file = use_d_vector_file or config.get("use_d_vector_file", False) return use_d_vector_file - def _init_speaker_manager(self): - """Initialize the SpeakerManager""" - # setup if multi-speaker settings are in the global model config - speaker_manager = None - speakers_file = get_from_config_or_model_args_with_default(self.tts_config, "speakers_file", None) - if self._is_use_speaker_embedding(): - if self.tts_speakers_file: - speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file) - elif speakers_file: - speaker_manager = SpeakerManager(speaker_id_file_path=speakers_file) - - if self._is_use_d_vector_file(): - d_vector_file = get_from_config_or_model_args_with_default(self.tts_config, "d_vector_file", None) - if self.tts_speakers_file: - speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file) - elif d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=d_vector_file) - return speaker_manager - - def _init_speaker_encoder(self, speaker_manager): - """Initialize the SpeakerEncoder""" - if self.encoder_checkpoint: - if speaker_manager is None: - speaker_manager = SpeakerManager( - encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config - ) - else: - speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) - return speaker_manager - - def _init_language_manager(self): - """Initialize the LanguageManager""" - # setup if multi-lingual settings are in the global model config - language_manager = None - if check_config_and_model_args(self.tts_config, "use_language_embedding", True): - if self.tts_languages_file: - language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) - elif self.tts_config.get("language_ids_file", None): - language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file) - else: - language_manager = LanguageManager(config=self.tts_config) - return language_manager - def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. From a27133db12d7e57433bda7b7fbdabbaa63f18458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:26:23 +0000 Subject: [PATCH 55/67] Add pitch_fmin pitch_fmax args to the audio --- TTS/utils/audio.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index e92acf574e..acd2cfcb86 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -239,6 +239,12 @@ class AudioProcessor(object): mel_fmax (int, optional): maximum filter frequency for computing melspectrograms. Defaults to None. + pitch_fmin (int, optional): + minimum filter frequency for computing pitch. 
Defaults to None. + + pitch_fmax (int, optional): + maximum filter frequency for computing pitch. Defaults to None. + spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. @@ -300,6 +306,8 @@ def __init__( max_norm=None, mel_fmin=None, mel_fmax=None, + pitch_fmax=None, + pitch_fmin=None, spec_gain=20, stft_pad_mode="reflect", clip_norm=True, @@ -333,6 +341,8 @@ def __init__( self.symmetric_norm = symmetric_norm self.mel_fmin = mel_fmin or 0 self.mel_fmax = mel_fmax + self.pitch_fmin = pitch_fmin + self.pitch_fmax = pitch_fmax self.spec_gain = float(spec_gain) self.stft_pad_mode = stft_pad_mode self.max_norm = 1.0 if max_norm is None else float(max_norm) @@ -726,12 +736,12 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: >>> WAV_FILE = filename = librosa.util.example_audio_file() >>> from TTS.config import BaseAudioConfig >>> from TTS.utils.audio import AudioProcessor - >>> conf = BaseAudioConfig(mel_fmax=8000) + >>> conf = BaseAudioConfig(pitch_fmax=8000) >>> ap = AudioProcessor(**conf) >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> pitch = ap.compute_f0(wav) """ - assert self.mel_fmax is not None, " [!] Set `mel_fmax` before caling `compute_f0`." + assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." # align F0 length to the spectrogram length if len(x) % self.hop_length == 0: x = np.pad(x, (0, self.hop_length // 2), mode="reflect") @@ -739,7 +749,7 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: f0, t = pw.dio( x.astype(np.double), fs=self.sample_rate, - f0_ceil=self.mel_fmax, + f0_ceil=self.pitch_fmax, frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) From 2303c91f5a2634d1b7bf840ad958637345780cac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:26:47 +0000 Subject: [PATCH 56/67] Plot pitch over input characters --- TTS/tts/utils/visual.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py index de6d95c5a0..4fd1f19cb8 100644 --- a/TTS/tts/utils/visual.py +++ b/TTS/tts/utils/visual.py @@ -87,6 +87,39 @@ def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False) return fig +def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False): + """Plot pitch curves on top of the input characters. + + Args: + pitch (np.array): Pitch values. + chars (str): Characters to place to the x-axis. 
+ + Shapes: + pitch: :math:`(T,)` + """ + old_fig_size = plt.rcParams["figure.figsize"] + if fig_size is not None: + plt.rcParams["figure.figsize"] = fig_size + + fig, ax = plt.subplots() + + x = np.array(range(len(chars))) + my_xticks = [c for c in chars] + plt.xticks(x, my_xticks) + + ax.set_xlabel("characters") + ax.set_ylabel("freq") + + ax2 = ax.twinx() + ax2.plot(pitch, linewidth=5.0, color="red") + ax2.set_ylabel("F0") + + plt.rcParams["figure.figsize"] = old_fig_size + if not output_fig: + plt.close() + return fig + + def visualize( alignment, postnet_output, From a8352d9fa9a3724a482945c61dc77cc2c5f891b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:27:13 +0000 Subject: [PATCH 57/67] Update language manager --- TTS/tts/utils/languages.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 5cecbe6908..8f14d71735 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,7 @@ import json import os from typing import Dict, List +from TTS.config import check_config_and_model_args import fsspec import numpy as np @@ -105,7 +106,12 @@ def init_from_config(config: Coqpit) -> "LanguageManager": Args: config (Coqpit): Coqpit config. """ - return LanguageManager(config=config) + language_manager = None + if check_config_and_model_args(config, "use_language_embedding", True): + if config.get("language_ids_file", None): + language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) + language_manager = LanguageManager(config=config) + return language_manager def _set_file_path(path): From 839750202320184f53b5cc4023564e03270695d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:28:33 +0000 Subject: [PATCH 58/67] Update forwardtts --- TTS/tts/layers/losses.py | 6 +++++- TTS/tts/models/forward_tts.py | 29 ++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 7de4504142..75320d1078 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -740,6 +740,7 @@ def forward( alignment_logprob=None, alignment_hard=None, alignment_soft=None, + binary_loss_weight=None ): loss = 0 return_dict = {} @@ -772,7 +773,10 @@ def forward( if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None: binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss - return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss + if binary_loss_weight: + return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + else: + return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss return_dict["loss"] = loss return return_dict diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 699f31426c..bb8640a3dd 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -15,7 +15,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.tts.utils.visual import plot_alignment, plot_pitch, plot_spectrogram +from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, 
plot_spectrogram @dataclass @@ -186,7 +186,7 @@ def __init__( self.max_duration = self.args.max_duration self.use_aligner = self.args.use_aligner self.use_pitch = self.args.use_pitch - self.use_binary_alignment_loss = False + self.binary_loss_weight = 0.0 self.length_scale = ( float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale @@ -644,8 +644,9 @@ def train_step(self, batch: dict, criterion: nn.Module): pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None, input_lens=text_lengths, alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None, - alignment_soft=outputs["alignment_soft"] if self.use_binary_alignment_loss else None, - alignment_hard=outputs["alignment_mas"] if self.use_binary_alignment_loss else None, + alignment_soft=outputs["alignment_soft"], + alignment_hard=outputs["alignment_mas"], + binary_loss_weight=self.binary_loss_weight ) # compute duration error durations_pred = outputs["durations"] @@ -672,17 +673,12 @@ def _create_logs(self, batch, outputs, ap): # plot pitch figures if self.args.use_pitch: - pitch = batch["pitch"] - pitch_avg_expanded, _ = self.expand_encoder_outputs( - outputs["pitch_avg"], outputs["durations"], outputs["x_mask"], outputs["y_mask"] - ) - pitch = pitch[0, 0].data.cpu().numpy() - # TODO: denormalize before plotting - pitch = abs(pitch) - pitch_avg_expanded = abs(pitch_avg_expanded[0, 0]).data.cpu().numpy() + pitch_avg = abs(outputs["pitch_avg_gt"][0, 0].data.cpu().numpy()) + pitch_avg_hat = abs(outputs["pitch_avg"][0, 0].data.cpu().numpy()) + chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy()) pitch_figures = { - "pitch_ground_truth": plot_pitch(pitch, gt_spec, ap, output_fig=False), - "pitch_avg_predicted": plot_pitch(pitch_avg_expanded, pred_spec, ap, output_fig=False), + "pitch_ground_truth": plot_avg_pitch(pitch_avg, chars, output_fig=False), + "pitch_avg_predicted": plot_avg_pitch(pitch_avg_hat, chars, output_fig=False), } figures.update(pitch_figures) @@ -725,9 +721,8 @@ def get_criterion(self): return ForwardTTSLoss(self.config) def on_train_step_start(self, trainer): - """Enable binary alignment loss when needed""" - if trainer.total_steps_done > self.config.binary_align_loss_start_step: - self.use_binary_alignment_loss = True + """Schedule binary loss weight.""" + self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): From c2d5be53886984d023b4f6e26dadb24d3e8b3a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:28:48 +0000 Subject: [PATCH 59/67] Fix dataset preprocessing --- TTS/tts/datasets/dataset.py | 67 ++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 99d9429937..10fd1696ae 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,4 +1,5 @@ import collections +from email.mime import audio import os import random from typing import Dict, List, Union @@ -140,8 +141,6 @@ def __init__( self.pitch_computed = False self.tokenizer = tokenizer - self.audio_lengths, self.text_lengths = self.compute_lengths(self.samples) - if self.tokenizer.use_phonemes: self.phoneme_dataset = PhonemeDataset( self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers @@ -254,16 +253,14 @@ def 
load_data(self, idx): return sample @staticmethod - def compute_lengths(samples): - audio_lengths = [] - text_lengths = [] + def _compute_lengths(samples): + new_samples = [] for item in samples: text, wav_file, *_ = _parse_sample(item) - audio_lengths.append(os.path.getsize(wav_file) / 16 * 8) # assuming 16bit audio - text_lengths.append(len(text)) - audio_lengths = np.array(audio_lengths) - text_lengths = np.array(text_lengths) - return audio_lengths, text_lengths + audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + text_lenght = len(text) + new_samples += [item + [audio_length, text_lenght]] + return new_samples @staticmethod def filter_by_length(lengths: List[int], min_len: int, max_len: int): @@ -279,8 +276,9 @@ def filter_by_length(lengths: List[int], min_len: int, max_len: int): return ignore_idx, keep_idx @staticmethod - def sort_by_length(lengths: List[int]): - idxs = np.argsort(lengths) # ascending order + def sort_by_length(samples: List[List]): + audio_lengths = [s[-2] for s in samples] + idxs = np.argsort(audio_lengths) # ascending order return idxs @staticmethod @@ -294,39 +292,38 @@ def create_buckets(samples, batch_group_size: int): samples[offset:end_offset] = temp_items return samples - def select_samples_by_idx(self, idxs): - samples = [] - audio_lengths = [] - text_lengths = [] + def _select_samples_by_idx(self, idxs, samples): + samples_new = [] for idx in idxs: - samples.append(self.samples[idx]) - audio_lengths.append(self.audio_lengths[idx]) - text_lengths.append(self.text_lengths[idx]) - return samples, audio_lengths, text_lengths + samples_new.append(samples[idx]) + return samples_new def preprocess_samples(self): r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length range. """ + samples = self._compute_lengths(self.samples) # sort items based on the sequence length in ascending order - text_ignore_idx, text_keep_idx = self.filter_by_length(self.text_lengths, self.min_text_len, self.max_text_len) + text_lengths = [i[-1] for i in samples] + audio_lengths = [i[-2] for i in samples] + text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len) audio_ignore_idx, audio_keep_idx = self.filter_by_length( - self.audio_lengths, self.min_audio_len, self.max_audio_len + audio_lengths, self.min_audio_len, self.max_audio_len ) - keep_idx = list(set(audio_keep_idx) | set(text_keep_idx)) + keep_idx = list(set(audio_keep_idx) & set(text_keep_idx)) ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx)) - samples, audio_lengths, _ = self.select_samples_by_idx(keep_idx) + samples = self._select_samples_by_idx(keep_idx, samples) - sorted_idxs = self.sort_by_length(audio_lengths) + sorted_idxs = self.sort_by_length(samples) if self.start_by_longest: longest_idxs = sorted_idxs[-1] sorted_idxs[-1] = sorted_idxs[0] sorted_idxs[0] = longest_idxs - samples, audio_lengths, text_lengtsh = self.select_samples_by_idx(sorted_idxs) + samples = self._select_samples_by_idx(sorted_idxs, samples) if len(samples) == 0: raise RuntimeError(" [!] 
No samples left") @@ -338,19 +335,19 @@ def preprocess_samples(self): samples = self.create_buckets(samples, self.batch_group_size) # update items to the new sorted items - self.samples = samples - self.audio_lengths = audio_lengths - self.text_lengths = text_lengtsh + audio_lengths = [s[-2] for s in samples] + text_lengths = [s[-1] for s in samples] + self.samples = [s[:-2] for s in samples] if self.verbose: print(" | > Preprocessing samples") - print(" | > Max text length: {}".format(np.max(self.text_lengths))) - print(" | > Min text length: {}".format(np.min(self.text_lengths))) - print(" | > Avg text length: {}".format(np.mean(self.text_lengths))) + print(" | > Max text length: {}".format(np.max(text_lengths))) + print(" | > Min text length: {}".format(np.min(text_lengths))) + print(" | > Avg text length: {}".format(np.mean(text_lengths))) print(" | ") - print(" | > Max audio length: {}".format(np.max(self.audio_lengths))) - print(" | > Min audio length: {}".format(np.min(self.audio_lengths))) - print(" | > Avg audio length: {}".format(np.mean(self.audio_lengths))) + print(" | > Max audio length: {}".format(np.max(audio_lengths))) + print(" | > Min audio length: {}".format(np.min(audio_lengths))) + print(" | > Avg audio length: {}".format(np.mean(audio_lengths))) print(f" | > Num. instances discarded samples: {len(ignore_idx)}") print(" | > Batch group size: {}.".format(self.batch_group_size)) From ad983065f71fc96591a92c1086a4fce59eb9359e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 09:29:21 +0000 Subject: [PATCH 60/67] Update FastPitchConfig --- TTS/tts/configs/fast_pitch_config.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index 8f0631028a..de87038807 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -89,12 +89,9 @@ class FastPitchConfig(BaseTTSConfig): pitch_loss_alpha (float): Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0. - binary_loss_alpha (float): + binary_align_loss_alpha (float): Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0. - binary_align_loss_start_step (int): - Start binary alignment loss after this many steps. Defaults to 20000. - min_seq_len (int): Minimum input sequence length to be used at training. 
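The `binary_loss_warmup_epochs` default introduced in this patch replaces the old hard `binary_align_loss_start_step` switch with a linear ramp, matching `on_train_step_start` in PATCH 58 above. A minimal sketch of the schedule, assuming `epochs_done` is reported by the trainer:

binary_loss_warmup_epochs = 150  # the new config default

def binary_loss_weight(epochs_done: int) -> float:
    # ramps linearly from 0.0 to 1.0 over the warm-up epochs, then stays at
    # 1.0 for the rest of training; the trailing * 1.0 mirrors the patch
    return min(epochs_done / binary_loss_warmup_epochs, 1.0) * 1.0

for epoch in (0, 75, 150, 300):
    print(epoch, binary_loss_weight(epoch))  # 0.0, 0.5, 1.0, 1.0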
@@ -129,12 +126,12 @@ class FastPitchConfig(BaseTTSConfig): duration_loss_type: str = "mse" use_ssim_loss: bool = True ssim_loss_alpha: float = 1.0 - dur_loss_alpha: float = 1.0 spec_loss_alpha: float = 1.0 - pitch_loss_alpha: float = 1.0 aligner_loss_alpha: float = 1.0 - binary_align_loss_alpha: float = 1.0 - binary_align_loss_start_step: int = 20000 + pitch_loss_alpha: float = 0.1 + dur_loss_alpha: float = 0.1 + binary_align_loss_alpha: float = 0.1 + binary_loss_warmup_epochs: int = 150 # overrides min_seq_len: int = 13 From f966a459e77ef06f16c045c47e1579106ac9a9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 Jan 2022 10:40:29 +0000 Subject: [PATCH 61/67] Make style --- TTS/bin/find_unique_phonemes.py | 1 - TTS/config/shared_configs.py | 9 +++++++++ TTS/tts/datasets/dataset.py | 10 ++++------ TTS/tts/layers/losses.py | 6 ++++-- TTS/tts/models/forward_tts.py | 2 +- TTS/tts/models/vits.py | 10 ++++++---- TTS/tts/utils/helpers.py | 12 ++++++++---- TTS/tts/utils/languages.py | 3 ++- TTS/tts/utils/visual.py | 2 +- TTS/utils/synthesizer.py | 4 +--- TTS/vocoder/models/gan.py | 4 ++-- tests/data_tests/test_loader.py | 2 +- tests/tts_tests/test_helpers.py | 6 +++--- 13 files changed, 42 insertions(+), 29 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index e84c17de2f..10c7110750 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -9,7 +9,6 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut - phonemizer = Gruut(language="en-us") diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 217282adb0..392f10af56 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -57,6 +57,12 @@ class BaseAudioConfig(Coqpit): do_amp_to_db_mel (bool, optional): enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + pitch_fmax (float, optional): + Maximum frequency of the F0 frames. Defaults to ```640```. + + pitch_fmin (float, optional): + Minimum frequency of the F0 frames. Defaults to ```0```. + trim_db (int): Silence threshold used for silence trimming. Defaults to 45. 
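These `pitch_fmax`/`pitch_fmin` values feed the F0 extractor rather than the mel filterbank, mirroring `compute_f0` from PATCH 55 above. A minimal sketch of that extraction step, assuming `pyworld` and `numpy` are installed and using the new defaults; the one-second noise buffer is only a stand-in for a real waveform:

import numpy as np
import pyworld as pw

sample_rate, hop_length, pitch_fmax = 22050, 256, 640.0
wav = np.random.uniform(-1, 1, sample_rate).astype(np.double)  # 1 s stand-in signal

f0, t = pw.dio(
    wav,
    fs=sample_rate,
    f0_ceil=pitch_fmax,  # capped by pitch_fmax now, not mel_fmax
    frame_period=1000 * hop_length / sample_rate,  # one F0 value per hop
)
f0 = pw.stonemask(wav, f0, t, sample_rate)  # refine the coarse dio estimate
print(f0.shape)  # roughly len(wav) / hop_length frames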
@@ -135,6 +141,9 @@ class BaseAudioConfig(Coqpit): spec_gain: int = 20 do_amp_to_db_linear: bool = True do_amp_to_db_mel: bool = True + # f0 params + pitch_fmax: float = 640.0 + pitch_fmin: float = 0.0 # normalization params signal_norm: bool = True min_level_db: int = -100 diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 10fd1696ae..0bcc554c51 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,5 +1,4 @@ import collections -from email.mime import audio import os import random from typing import Dict, List, Union @@ -257,7 +256,7 @@ def _compute_lengths(samples): new_samples = [] for item in samples: text, wav_file, *_ = _parse_sample(item) - audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + audio_length = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio text_lenght = len(text) new_samples += [item + [audio_length, text_lenght]] return new_samples @@ -292,7 +291,8 @@ def create_buckets(samples, batch_group_size: int): samples[offset:end_offset] = temp_items return samples - def _select_samples_by_idx(self, idxs, samples): + @staticmethod + def _select_samples_by_idx(idxs, samples): samples_new = [] for idx in idxs: samples_new.append(samples[idx]) @@ -308,9 +308,7 @@ def preprocess_samples(self): text_lengths = [i[-1] for i in samples] audio_lengths = [i[-2] for i in samples] text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len) - audio_ignore_idx, audio_keep_idx = self.filter_by_length( - audio_lengths, self.min_audio_len, self.max_audio_len - ) + audio_ignore_idx, audio_keep_idx = self.filter_by_length(audio_lengths, self.min_audio_len, self.max_audio_len) keep_idx = list(set(audio_keep_idx) & set(text_keep_idx)) ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx)) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 75320d1078..b7c8f6e458 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -740,7 +740,7 @@ def forward( alignment_logprob=None, alignment_hard=None, alignment_soft=None, - binary_loss_weight=None + binary_loss_weight=None, ): loss = 0 return_dict = {} @@ -774,7 +774,9 @@ def forward( binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss if binary_loss_weight: - return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + return_dict["loss_binary_alignment"] = ( + self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight + ) else: return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index bb8640a3dd..8d554f767f 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -646,7 +646,7 @@ def train_step(self, batch: dict, criterion: nn.Module): alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None, alignment_soft=outputs["alignment_soft"], alignment_hard=outputs["alignment_mas"], - binary_loss_weight=self.binary_loss_weight + binary_loss_weight=self.binary_loss_weight, ) # compute duration error durations_pred = outputs["durations"] diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 301ddfcd6e..acd5c1729d 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -361,7 +361,9 @@ def __init__( ) upsample_rate = 
math.prod(self.args.upsample_rates_decoder) - assert upsample_rate == self.config.audio.hop_length, f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" + assert ( + upsample_rate == self.config.audio.hop_length + ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {self.config.audio.hop_length}" self.waveform_decoder = HifiganGenerator( self.args.hidden_channels, 1, @@ -671,7 +673,7 @@ def forward( waveform, slice_ids * self.config.audio.hop_length, self.args.spec_segment_size * self.config.audio.hop_length, - pad_short=True + pad_short=True, ) if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: @@ -693,7 +695,7 @@ def forward( outputs.update( { "model_outputs": o, - "alignments" : attn.squeeze(1), + "alignments": attn.squeeze(1), "m_p": m_p, "logs_p": logs_p, "z": z, @@ -949,7 +951,7 @@ def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: - figures, audios = self._log(self.ap, batch, outputs, "eval") + figures, audios = self._log(self.ap, batch, outputs, "eval") logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 3251337768..c2e7f56146 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -68,7 +68,7 @@ def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_ """ # pad the input tensor if it is shorter than the segment size if pad_short and x.shape[-1] < segment_size: - x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) + x = torch.nn.functional.pad(x, (0, segment_size - x.size(2))) segments = torch.zeros_like(x[:, :, :segment_size]) @@ -78,12 +78,14 @@ def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_ x_i = x[i] if pad_short and index_end > x.size(2): # pad the sample if it is shorter than the segment size - x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) + x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) segments[i] = x_i[:, index_start:index_end] return segments -def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False): +def rand_segments( + x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False +): """Create random segments based on the input lengths. Args: @@ -110,7 +112,9 @@ def rand_segments(x: torch.tensor, x_lengths: torch.tensor = None, segment_size= _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size + 1 else: - assert all(len_diff > 0), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all( + len_diff > 0 + ), f" [!] At least one sample is shorter than the segment size ({segment_size}). 
diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py
index 8f14d71735..6c1f63f087 100644
--- a/TTS/tts/utils/languages.py
+++ b/TTS/tts/utils/languages.py
@@ -1,7 +1,6 @@
 import json
 import os
 from typing import Dict, List
-from TTS.config import check_config_and_model_args
 
 import fsspec
 import numpy as np
@@ -9,6 +8,8 @@
 from coqpit import Coqpit
 from torch.utils.data.sampler import WeightedRandomSampler
 
+from TTS.config import check_config_and_model_args
+
 
 class LanguageManager:
     """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py
index 4fd1f19cb8..78c1298109 100644
--- a/TTS/tts/utils/visual.py
+++ b/TTS/tts/utils/visual.py
@@ -104,7 +104,7 @@ def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
 
     fig, ax = plt.subplots()
     x = np.array(range(len(chars)))
-    my_xticks = [c for c in chars]
+    my_xticks = chars
     plt.xticks(x, my_xticks)
     ax.set_xlabel("characters")
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index ddc2a6a545..6821e975b0 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -5,10 +5,8 @@
 import pysbd
 import torch
 
-from TTS.config import check_config_and_model_args, get_from_config_or_model_args_with_default, load_config
+from TTS.config import load_config
 from TTS.tts.models import setup_model as setup_tts_model
-from TTS.tts.utils.languages import LanguageManager
-from TTS.tts.utils.speakers import SpeakerManager
 
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py
index 7e03e94f2e..6978f0e798 100644
--- a/TTS/vocoder/models/gan.py
+++ b/TTS/vocoder/models/gan.py
@@ -19,7 +19,7 @@
 
 
 class GAN(BaseVocoder):
-    def __init__(self, config: Coqpit, ap: AudioProcessor=None):
+    def __init__(self, config: Coqpit, ap: AudioProcessor = None):
         """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer.
         It also helps mixing and matching different generator and disciminator networks easily.
@@ -306,7 +306,7 @@ def format_batch(batch: List) -> Dict:
             x, y = batch
         return {"input": x, "waveform": y}
 
-    def get_data_loader(  # pylint: disable=no-self-use
+    def get_data_loader(  # pylint: disable=no-self-use, unused-argument
        self,
        config: Coqpit,
        assets: Dict,
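The `GAN.__init__` docstring above describes the wrapper pattern: one module owns both networks so the trainer can drive alternating updates through a single interface. A rough sketch of that pattern (hypothetical names, not the Coqui implementation):

    import torch
    from torch import nn

    class GANWrapper(nn.Module):
        """Bundle a generator and a discriminator behind one trainer-facing module."""

        def __init__(self, generator: nn.Module, discriminator: nn.Module):
            super().__init__()
            self.model_g = generator
            self.model_d = discriminator

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # inference only touches the generator
            return self.model_g(x)

        def train_step(self, batch: dict, optimizer_idx: int):
            fake = self.model_g(batch["input"])
            if optimizer_idx == 0:
                # discriminator turn: score real audio and detached fake audio
                return self.model_d(batch["waveform"]), self.model_d(fake.detach())
            # generator turn: gradients flow back through both networks
            return self.model_d(fake)

Because either sub-network can be swapped independently, the wrapper is what lets different generators and discriminators be mixed and matched, as the docstring says.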
diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index 477ee71fb0..75245ab8b7 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -52,7 +52,7 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
             max_text_len=c.max_text_len,
             min_audio_len=c.min_audio_len,
             max_audio_len=c.max_audio_len,
-            start_by_longest=start_by_longest
+            start_by_longest=start_by_longest,
         )
         dataloader = DataLoader(
             dataset,
diff --git a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py
index 708ecbf50e..23bb440a0a 100644
--- a/tests/tts_tests/test_helpers.py
+++ b/tests/tts_tests/test_helpers.py
@@ -1,6 +1,6 @@
 import torch as T
 
-from TTS.tts.utils.helpers import average_over_durations, generate_path, segment, sequence_mask, rand_segments
+from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask
 
 
 def average_over_durations_test():  # pylint: disable=no-self-use
@@ -57,12 +57,12 @@ def rand_segments_test():
     assert segments.shape == (2, 3, 3)
     assert all(seg_idxs >= 0), seg_idxs
     try:
-        segments, _ = rand_segments(x, x_lens, segment_size=5) 
+        segments, _ = rand_segments(x, x_lens, segment_size=5)
         raise Exception("Should have failed")
     except:
         pass
     x_lens_back = x_lens.clone()
-    segments, seg_idxs= rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
+    segments, seg_idxs = rand_segments(x, x_lens.clone(), segment_size=5, pad_short=True, let_short_samples=True)
     assert segments.shape == (2, 3, 5)
     assert all(seg_idxs >= 0), seg_idxs
     assert all(x_lens_back == x_lens)
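Aside on `rand_segments_test` above: the expected failure is still caught with a bare `except`. A tighter variant that pins the exception type, sketched under the assumption that the helper keeps signalling the error with `assert`:

    import torch as T

    from TTS.tts.utils.helpers import rand_segments

    x = T.rand(2, 3, 4)
    x_lens = T.full((2,), 3, dtype=T.long)  # every sample shorter than segment_size=5
    try:
        rand_segments(x, x_lens, segment_size=5)
        raise RuntimeError("Should have failed")
    except AssertionError:
        pass  # expected: at least one sample is shorter than the segment size

Catching `AssertionError` specifically keeps the test from silently swallowing unrelated bugs the way a bare `except` can.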
\"odysseus/metadata.csv\",\n", - " \"undine/metadata.csv\",\n", - " \"reise_tilsit/metadata.csv\",\n", - " \"schmied_seines_glueckes/metadata.csv\",\n", - " \"kammmacher/metadata.csv\",\n", - " \"unterm_birnbaum/metadata.csv\",\n", - " \"liebesbriefe/metadata.csv\",\n", - " \"sandmann/metadata.csv\"]\n", - "NUM_PROC = 8" + "NUM_PROC = 8\n", + "DATASET_CONFIG = BaseDatasetConfig(\n", + " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/home/ubuntu/TTS/depot/data/male_dataset1_44k/\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n", + " txt_file = os.path.join(root_path, meta_file)\n", + " items = []\n", + " speaker_name = \"maledataset1\"\n", + " with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n", + " for line in ttf:\n", + " cols = line.split(\"|\")\n", + " wav_file = os.path.join(root_path, \"wavs\", cols[0])\n", + " text = cols[1]\n", + " items.append([text, wav_file, speaker_name])\n", + " return items" ] }, { @@ -69,8 +77,10 @@ "outputs": [], "source": [ "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n", - "items = mailabs(DATA_PATH, META_DATA)\n", - "print(\" > Number of audio files: {}\".format(len(items)))" + "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n", + "items = train_samples + eval_samples\n", + "print(\" > Number of audio files: {}\".format(len(items)))\n", + "print(items[1])" ] }, { @@ -103,6 +113,15 @@ "print([item for item, count in c.items() if count > 1])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "item" + ] + }, { "cell_type": "code", "execution_count": null, @@ -112,11 +131,9 @@ "outputs": [], "source": [ "def load_item(item):\n", - " file_name = item[1].strip()\n", " text = item[0].strip()\n", - " audio = librosa.load(file_name, sr=None)\n", - " sr = audio[1]\n", - " audio = audio[0]\n", + " file_name = item[1].strip()\n", + " audio, sr = librosa.load(file_name, sr=None)\n", " audio_len = len(audio) / sr\n", " text_len = len(text)\n", " return file_name, text, text_len, audio, audio_len\n", @@ -374,11 +391,18 @@ "# fequency bar plot - it takes time!!\n", "w_count_df.plot.bar()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -392,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.1" } }, "nbformat": 4, From f91220690b7fb7fcaacf900eaec95cda1309c25f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:20:07 +0100 Subject: [PATCH 63/67] Load right char class dynamically --- TTS/tts/utils/text/tokenizer.py | 21 ++++++++++++++++----- TTS/utils/generic_utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 80be368d48..bdaf8ea64b 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -3,6 +3,7 @@ from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, 
From f91220690b7fb7fcaacf900eaec95cda1309c25f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 28 Jan 2022 10:20:07 +0100
Subject: [PATCH 63/67] Load right char class dynamically

---
 TTS/tts/utils/text/tokenizer.py | 21 ++++++++++++++++-----
 TTS/utils/generic_utils.py      | 27 +++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 80be368d48..bdaf8ea64b 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -3,6 +3,7 @@
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
 from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
+from TTS.utils.generic_utils import get_import_path, import_class
 
 
 class TTSTokenizer:
@@ -152,15 +153,25 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
 
         # init characters
         if characters is None:
-            if config.use_phonemes:
-                # init phoneme set
-                characters, new_config = IPAPhonemes().init_from_config(config)
+            # set characters based on defined characters class
+            if config.characters and config.characters.characters_class:
+                CharactersClass = import_class(config.characters.characters_class)
+                characters, new_config = CharactersClass.init_from_config(config)
+            # set characters based on config
             else:
-                # init character set
-                characters, new_config = Graphemes().init_from_config(config)
+                if config.use_phonemes:
+                    # init phoneme set
+                    characters, new_config = IPAPhonemes().init_from_config(config)
+                else:
+                    # init character set
+                    characters, new_config = Graphemes().init_from_config(config)
+
         else:
             characters, new_config = characters.init_from_config(config)
 
+        # set characters class
+        new_config.characters.characters_class = get_import_path(characters)
+
         # init phonemizer
         phonemizer = None
         if config.use_phonemes:
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
index 6504cca622..69609bcbf3 100644
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -95,6 +95,33 @@ def find_module(module_path: str, module_name: str) -> object:
     return getattr(module, class_name)
 
 
+def import_class(module_path: str) -> object:
+    """Import a class from a module path.
+
+    Args:
+        module_path (str): The module path of the class.
+
+    Returns:
+        object: The imported class.
+    """
+    class_name = module_path.split(".")[-1]
+    module_path = ".".join(module_path.split(".")[:-1])
+    module = importlib.import_module(module_path)
+    return getattr(module, class_name)
+
+
+def get_import_path(obj: object) -> str:
+    """Get the import path of a class.
+
+    Args:
+        obj (object): The class object.
+
+    Returns:
+        str: The import path of the class.
+    """
+    return ".".join([type(obj).__module__, type(obj).__name__])
+
+
 def get_user_data_dir(appname):
     if sys.platform == "win32":
         import winreg  # pylint: disable=import-outside-toplevel
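The two helpers added above are inverses, which is what lets the tokenizer resolve `characters_class` from a config and then write the chosen class back into it. A quick round-trip sketch (the class path comes from the imports in this patch; default construction is assumed to work):

    from TTS.utils.generic_utils import get_import_path, import_class

    # dotted path -> class object
    CharactersClass = import_class("TTS.tts.utils.text.characters.Graphemes")
    characters = CharactersClass()  # assumes Graphemes initializes with its default character set

    # instance -> dotted path, e.g. for serializing the choice into the config
    assert get_import_path(characters) == "TTS.tts.utils.text.characters.Graphemes"

Storing the dotted path rather than the class itself keeps configs JSON-serializable while still letting users plug in their own character sets.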
state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} - self.load_state_dict(state["model"]) + # handle fine-tuning from a checkpoint with additional speakers + if state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape: + print(" > Loading checkpoint with additional speakers.") + emb_g = state["model"]["emb_g.weight"] + new_row = torch.zeros(1, emb_g.shape[1]) + emb_g = torch.cat([emb_g, new_row], axis=0) + state["model"]["emb_g.weight"] = emb_g + + self.load_state_dict(state["model"], strict=False) if eval: self.eval() assert not self.training From a164485ab709e48999e2871561d793849133a884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:23:22 +0100 Subject: [PATCH 65/67] Fix up --- TTS/tts/configs/shared_configs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 09266ce2e7..3c450cea64 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -53,6 +53,10 @@ class CharactersConfig(Coqpit): """Defines arguments for the `BaseCharacters` and its subclasses. Args: + characters_class (str): + Defines the class of the characters used. If None, we pick ```Phonemes``` or ```Graphemes``` based on + the configuration. Defaults to None. + pad (str): characters in place of empty padding. Defaults to None. @@ -84,6 +88,7 @@ class CharactersConfig(Coqpit): Sort the characters in alphabetical order. Defaults to True. """ + characters_class: str = None pad: str = None eos: str = None bos: str = None From 6c55245b2854bf5cfbceb84c5cb75ed9a9e517ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 10:23:52 +0100 Subject: [PATCH 66/67] Fix VCTK VITS recipe --- recipes/vctk/vits/train_vits.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 2906557dde..caf1caa100 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -57,9 +57,7 @@ print_step=25, print_eval=False, mixed_precision=True, - sort_by_audio_len=True, - min_seq_len=32 * 256 * 4, - max_seq_len=1500000, + max_text_len= 325, # change this if you have a larger VRAM than 16GB output_path=output_path, datasets=[dataset_config], ) From a68fb7667757020068ea189a837d0f40979c84ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 Jan 2022 13:50:58 +0100 Subject: [PATCH 67/67] Set `drop_last` --- TTS/tts/models/base_tts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 7cdfa915df..0eb2b5f311 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -324,9 +324,9 @@ def get_data_loader( loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, - shuffle=False, + shuffle=False, # shuffle is done in the dataset. collate_fn=dataset.collate_fn, - drop_last=False, + drop_last=True, # setting this False might cause issues in AMP training. sampler=sampler, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False,