From a365a7e888454ac68e24fb787a7bab22ed6cdb73 Mon Sep 17 00:00:00 2001 From: p0p4k Date: Mon, 13 Feb 2023 18:34:00 +0900 Subject: [PATCH 1/7] numpy version for py310 (#2316) * numpy version for py310 requested in #2315 * Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7ee23dab5a..638897c126 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # core deps numpy==1.21.6;python_version<"3.10" -numpy==1.22.4;python_version=="3.10" +numpy;python_version=="3.10" cython==0.29.28 scipy>=1.4.0 torch>=1.7 @@ -39,4 +39,4 @@ gruut[de]==2.2.3 # deps for korean jamo nltk -g2pkk>=0.1.1 \ No newline at end of file +g2pkk>=0.1.1 From 16b98622521a6ce3fbaba822dc68f82c9030c9d0 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 27 Feb 2023 03:14:00 -0300 Subject: [PATCH 2/7] Fix Speaker Consistency Loss (SCL) (#2364) --- TTS/encoder/models/resnet.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index e75ab6c463..5eafcd6005 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -161,16 +161,14 @@ def forward(self, x, l2_norm=False): Shapes: - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` """ - with torch.no_grad(): - with torch.cuda.amp.autocast(enabled=False): - x.squeeze_(1) - # if you torch spec compute it otherwise use the mel spec computed by the AP - if self.use_torch_spec: - x = self.torch_spec(x) - - if self.log_input: - x = (x + 1e-6).log() - x = self.instancenorm(x).unsqueeze(1) + x.squeeze_(1) + # if you torch spec compute it otherwise use the mel spec computed by the AP + if self.use_torch_spec: + x = self.torch_spec(x) + + if self.log_input: + x = (x + 1e-6).log() + x = self.instancenorm(x).unsqueeze(1) x = self.conv1(x) x = self.relu(x) From d39bc74f57b4b5c28dde88e1f66f6072199d35d3 Mon Sep 17 00:00:00 2001 From: thennal10 Date: Wed, 1 Mar 2023 13:41:30 +0530 Subject: [PATCH 3/7] OverFlow with test sentences (#2253) * Fix typo in function definiton * Swap hasattr out hasattr(self, "speaker_manager") and hasattr(self, "language_manager") seems to be redundant since BaseTTS defines both. --- TTS/tts/models/base_tts.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 2059612de6..dc53edd078 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -111,7 +111,7 @@ def get_aux_input(self, **kwargs) -> Dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_setences(self, sentence_info): + def get_aux_input_from_test_sentences(self, sentence_info): if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -134,7 +134,7 @@ def get_aux_input_from_test_setences(self, sentence_info): # get speaker id/d_vector speaker_id, d_vector, language_id = None, None, None - if hasattr(self, "speaker_manager"): + if self.speaker_manager is not None: if config.use_d_vector_file: if speaker_name is None: d_vector = self.speaker_manager.get_random_embedding() @@ -147,7 +147,7 @@ def get_aux_input_from_test_setences(self, sentence_info): speaker_id = self.speaker_manager.name_to_id[speaker_name] # get language id - if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: + if self.language_manager is not None and config.use_language_embedding and language_name is not None: language_id = self.language_manager.name_to_id[language_name] return { @@ -287,7 +287,7 @@ def get_data_loader( loader = None else: # setup multi-speaker attributes - if hasattr(self, "speaker_manager") and self.speaker_manager is not None: + if self.speaker_manager is not None: if hasattr(config, "model_args"): speaker_id_mapping = ( self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None @@ -302,7 +302,7 @@ def get_data_loader( d_vector_mapping = None # setup multi-lingual attributes - if hasattr(self, "language_manager") and self.language_manager is not None: + if self.language_manager is not None: language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None else: language_id_mapping = None @@ -424,7 +424,7 @@ def on_init_start(self, trainer): print(f" > `speakers.pth` is saved to {output_path}.") print(" > `speakers_file` is updated in the config.json.") - if hasattr(self, "language_manager") and self.language_manager is not None: + if self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") self.language_manager.save_ids_to_file(output_path) trainer.config.language_ids_file = output_path From 478c8178b88921f17ed39ef19cd85be5c2ab5c81 Mon Sep 17 00:00:00 2001 From: Florian Quirin Date: Mon, 6 Mar 2023 10:08:21 +0100 Subject: [PATCH 4/7] Basic Mary-TTS API compatibility (#2352) * added basic Mary-TTS API endpoints to server - imported `parse_qs` from `urllib.parse` to parse HTTP POST parameters - imported `render_template_string` from `flask` to return text as endpoint result - added new routes: - `/locales` - returns list of locales (currently locale of active model) - `/voices` - returns list of voices (currently locale and name of active model) - `/process` - accepts synth. request (GET and POST) with parameter `INPUT_TEXT` (other parameters ignored since we have only one active model) * better log messages for Mary-TTS API - smaller tweaks to log output * use f-string in log print to please linter * updated server.py to match 'make style' result --- TTS/server/server.py | 53 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index c276a142ff..7324e80111 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -7,8 +7,9 @@ from pathlib import Path from threading import Lock from typing import Union +from urllib.parse import parse_qs -from flask import Flask, render_template, request, send_file +from flask import Flask, render_template, render_template_string, request, send_file from TTS.config import load_config from TTS.utils.manage import ModelManager @@ -187,15 +188,59 @@ def tts(): language_idx = request.args.get("language_id", "") style_wav = request.args.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) - print(" > Model input: {}".format(text)) - print(" > Speaker Idx: {}".format(speaker_idx)) - print(" > Language Idx: {}".format(language_idx)) + print(f" > Model input: {text}") + print(f" > Speaker Idx: {speaker_idx}") + print(f" > Language Idx: {language_idx}") wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) out = io.BytesIO() synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") +# Basic MaryTTS compatibility layer + + +@app.route("/locales", methods=["GET"]) +def mary_tts_api_locales(): + """MaryTTS-compatible /locales endpoint""" + # NOTE: We currently assume there is only one model active at the same time + if args.model_name is not None: + model_details = args.model_name.split("/") + else: + model_details = ["", "en", "", "default"] + return render_template_string("{{ locale }}\n", locale=model_details[1]) + + +@app.route("/voices", methods=["GET"]) +def mary_tts_api_voices(): + """MaryTTS-compatible /voices endpoint""" + # NOTE: We currently assume there is only one model active at the same time + if args.model_name is not None: + model_details = args.model_name.split("/") + else: + model_details = ["", "en", "", "default"] + return render_template_string( + "{{ name }} {{ locale }} {{ gender }}\n", name=model_details[3], locale=model_details[1], gender="u" + ) + + +@app.route("/process", methods=["GET", "POST"]) +def mary_tts_api_process(): + """MaryTTS-compatible /process endpoint""" + with lock: + if request.method == "POST": + data = parse_qs(request.get_data(as_text=True)) + # NOTE: we ignore param. LOCALE and VOICE for now since we have only one active model + text = data.get("INPUT_TEXT", [""])[0] + else: + text = request.args.get("INPUT_TEXT", "") + print(f" > Model input: {text}") + wavs = synthesizer.tts(text) + out = io.BytesIO() + synthesizer.save_wav(wavs, out) + return send_file(out, mimetype="audio/wav") + + def main(): app.run(debug=args.debug, host="::", port=args.port) From 624513018d8467e30f39bd67a658216dadd0a41a Mon Sep 17 00:00:00 2001 From: manmay nakhashi Date: Mon, 6 Mar 2023 14:50:25 +0530 Subject: [PATCH 5/7] add energy by default to Fastspeech2 config (#2326) * add energy by default * added energy to base tts * fix energy dataset * fix styles * fix test --- TTS/tts/configs/fastspeech2_config.py | 2 +- TTS/tts/datasets/dataset.py | 18 ++++++++++-------- TTS/tts/models/base_tts.py | 4 ++++ .../test_fastspeech_2_speaker_emb_train.py | 2 +- tests/tts_tests/test_fastspeech_2_train.py | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index f7ff219a93..68a3eec2f0 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig): base_model: str = "forward_tts" # model specific params - model_args: ForwardTTSArgs = ForwardTTSArgs() + model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True) # multi-speaker settings num_speakers: int = 0 diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index db74186b9f..df01d66323 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -189,6 +189,8 @@ def samples(self, new_samples): self._samples = new_samples if hasattr(self, "f0_dataset"): self.f0_dataset.samples = new_samples + if hasattr(self, "energy_dataset"): + self.energy_dataset.samples = new_samples if hasattr(self, "phoneme_dataset"): self.phoneme_dataset.samples = new_samples @@ -856,11 +858,11 @@ def __init__( def __getitem__(self, idx): item = self.samples[idx] - energy = self.compute_or_load(item["audio_file"]) + energy = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"])) if self.normalize_energy: assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available" energy = self.normalize(energy) - return {"audio_file": item["audio_file"], "energy": energy} + return {"audio_unique_name": item["audio_unique_name"], "energy": energy} def __len__(self): return len(self.samples) @@ -884,7 +886,7 @@ def precompute(self, num_workers=0): if self.normalize_energy: computed_data = [tensor for batch in computed_data for tensor in batch] # flatten - energy_mean, energy_std = self.compute_pitch_stats(computed_data) + energy_mean, energy_std = self.compute_energy_stats(computed_data) energy_stats = {"mean": energy_mean, "std": energy_std} np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True) @@ -900,7 +902,7 @@ def create_energy_file_path(wav_file, cache_path): @staticmethod def _compute_and_save_energy(ap, wav_file, energy_file=None): wav = ap.load_wav(wav_file) - energy = calculate_energy(wav) + energy = calculate_energy(wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length) if energy_file: np.save(energy_file, energy) return energy @@ -931,11 +933,11 @@ def denormalize(self, energy): energy[zero_idxs] = 0.0 return energy - def compute_or_load(self, wav_file): + def compute_or_load(self, wav_file, audio_unique_name): """ compute energy and return a numpy array of energy values """ - energy_file = self.create_Energy_file_path(wav_file, self.cache_path) + energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path) if not os.path.exists(energy_file): energy = self._compute_and_save_energy(self.ap, wav_file, energy_file) else: @@ -943,14 +945,14 @@ def compute_or_load(self, wav_file): return energy.astype(np.float32) def collate_fn(self, batch): - audio_file = [item["audio_file"] for item in batch] + audio_unique_name = [item["audio_unique_name"] for item in batch] energys = [item["energy"] for item in batch] energy_lens = [len(item["energy"]) for item in batch] energy_lens_max = max(energy_lens) energys_torch = torch.LongTensor(len(energys), energy_lens_max).fill_(self.get_pad_id()) for i, energy_len in enumerate(energy_lens): energys_torch[i, :energy_len] = torch.LongTensor(energys[i]) - return {"audio_file": audio_file, "energy": energys_torch, "energy_lens": energy_lens} + return {"audio_unique_name": audio_unique_name, "energy": energys_torch, "energy_lens": energy_lens} def print_logs(self, level: int = 0) -> None: indent = "\t" * level diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index dc53edd078..37a0935465 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -183,6 +183,7 @@ def format_batch(self, batch: Dict) -> Dict: attn_mask = batch["attns"] waveform = batch["waveform"] pitch = batch["pitch"] + energy = batch["energy"] language_ids = batch["language_ids"] max_text_length = torch.max(text_lengths.float()) max_spec_length = torch.max(mel_lengths.float()) @@ -231,6 +232,7 @@ def format_batch(self, batch: Dict) -> Dict: "item_idx": item_idx, "waveform": waveform, "pitch": pitch, + "energy": energy, "language_ids": language_ids, "audio_unique_names": batch["audio_unique_names"], } @@ -313,6 +315,8 @@ def get_data_loader( compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec, compute_f0=config.get("compute_f0", False), f0_cache_path=config.get("f0_cache_path", None), + compute_energy=config.get("compute_energy", False), + energy_cache_path=config.get("energy_cache_path", None), samples=samples, ap=self.ap, return_wav=config.return_wav if "return_wav" in config else False, diff --git a/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py index d12f8bedcc..35bda597d5 100644 --- a/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py @@ -38,7 +38,7 @@ f0_cache_path="tests/data/ljspeech/f0_cache/", compute_f0=True, compute_energy=True, - energy_cache_path="tests/data/ljspeech/f0_cache/", + energy_cache_path="tests/data/ljspeech/energy_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/tts_tests/test_fastspeech_2_train.py b/tests/tts_tests/test_fastspeech_2_train.py index f54e635142..dd4b07d240 100644 --- a/tests/tts_tests/test_fastspeech_2_train.py +++ b/tests/tts_tests/test_fastspeech_2_train.py @@ -38,7 +38,7 @@ f0_cache_path="tests/data/ljspeech/f0_cache/", compute_f0=True, compute_energy=True, - energy_cache_path="tests/data/ljspeech/f0_cache/", + energy_cache_path="tests/data/ljspeech/energy_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, From 4ca07514d4fffe88ad4951af4b8901aea1512d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Mar 2023 12:42:01 +0100 Subject: [PATCH 6/7] Remove doc bot (#2399) --- docs/source/_templates/page.html | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html index 2c6ef4ee96..dd1bc34fa6 100644 --- a/docs/source/_templates/page.html +++ b/docs/source/_templates/page.html @@ -1,23 +1,4 @@ {% extends "!page.html" %} {% block scripts %} {{ super() }} - - - - - - - {% endblock %} From c10f9a3699182a91d4c01afd65143ea39de44382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Mar 2023 12:42:20 +0100 Subject: [PATCH 7/7] Update docs (#2389) * Update docs index * Add MarryTTS docs * Update docs index * Add Overflow docs --- docs/source/index.md | 3 ++- docs/source/marytts.md | 0 docs/source/models/overflow.md | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 docs/source/marytts.md create mode 100644 docs/source/models/overflow.md diff --git a/docs/source/index.md b/docs/source/index.md index 3f27ffb862..5173592896 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -28,6 +28,7 @@ formatting_your_dataset what_makes_a_good_dataset tts_datasets + marytts .. toctree:: :maxdepth: 2 @@ -48,10 +49,10 @@ models/vits.md models/forward_tts.md models/tacotron1-2.md + models/overflow.md .. toctree:: :maxdepth: 2 :caption: `vocoder` Models ``` - diff --git a/docs/source/marytts.md b/docs/source/marytts.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/source/models/overflow.md b/docs/source/models/overflow.md new file mode 100644 index 0000000000..09e270eae5 --- /dev/null +++ b/docs/source/models/overflow.md @@ -0,0 +1,36 @@ +# Overflow TTS + +Neural HMMs are a type of neural transducer recently proposed for +sequence-to-sequence modelling in text-to-speech. They combine the best features +of classic statistical speech synthesis and modern neural TTS, requiring less +data and fewer training updates, and are less prone to gibberish output caused +by neural attention failures. In this paper, we combine neural HMM TTS with +normalising flows for describing the highly non-Gaussian distribution of speech +acoustics. The result is a powerful, fully probabilistic model of durations and +acoustics that can be trained using exact maximum likelihood. Compared to +dominant flow-based acoustic models, our approach integrates autoregression for +improved modelling of long-range dependences such as utterance-level prosody. +Experiments show that a system based on our proposal gives more accurate +pronunciations and better subjective speech quality than comparable methods, +whilst retaining the original advantages of neural HMMs. Audio examples and code +are available at https://shivammehta25.github.io/OverFlow/. + + +## Important resources & papers +- HMM: https://de.wikipedia.org/wiki/Hidden_Markov_Model +- OverflowTTS paper: https://arxiv.org/abs/2211.06892 +- Neural HMM: https://arxiv.org/abs/2108.13320 +- Audio Samples: https://shivammehta25.github.io/OverFlow/ + + +## OverflowConfig +```{eval-rst} +.. autoclass:: TTS.tts.configs.overflow_config.OverflowConfig + :members: +``` + +## Overflow Model +```{eval-rst} +.. autoclass:: TTS.tts.models.overflow.Overflow + :members: +``` \ No newline at end of file