From 0a136a8535ca107d6a08aa43c4772d50dd2f6b6b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 11:29:36 +0100 Subject: [PATCH 01/18] Download speaker file --- TTS/.models.json | 3 ++- TTS/utils/manage.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/.models.json b/TTS/.models.json index 1957d78adb..c60a04bb63 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -8,7 +8,8 @@ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers.pth" ], "model_hash": "10f92b55c512af7a8d39d650547a15a7", "default_vocoder": null, diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 463b840242..bfe25443f2 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -315,6 +315,7 @@ def _set_model_item(self, model_name): f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json", f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json", f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers.pth", ], } else: From 36143fee2625aac6c6b3f7c7d0e0b7d75800167a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 15:25:46 +0100 Subject: [PATCH 02/18] Add basic speaker manager --- TTS/.models.json | 2 +- TTS/tts/layers/xtts/speaker_manager.py | 9 +++++++++ TTS/tts/models/xtts.py | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 TTS/tts/layers/xtts/speaker_manager.py diff --git a/TTS/.models.json b/TTS/.models.json index c60a04bb63..30e74f8692 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -3,7 +3,7 @@ "multilingual": { 
"multi-dataset": { "xtts_v2": { - "description": "XTTS-v2.0.2 by Coqui with 16 languages.", + "description": "XTTS-v2.0.3 by Coqui with 17 languages.", "hf_url": [ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json", diff --git a/TTS/tts/layers/xtts/speaker_manager.py b/TTS/tts/layers/xtts/speaker_manager.py new file mode 100644 index 0000000000..a7d89d18ba --- /dev/null +++ b/TTS/tts/layers/xtts/speaker_manager.py @@ -0,0 +1,9 @@ +import torch + +class SpeakerManager(): + def __init__(self, speaker_file_path=None): + self.speakers = torch.load(speaker_file_path) + + @property + def name_to_id(self): + return self.speakers.keys() diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 6b8cc59101..966f7c0f85 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -11,6 +11,7 @@ from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence +from TTS.tts.layers.xtts.speaker_manager import SpeakerManager from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec @@ -733,6 +734,7 @@ def load_checkpoint( eval=True, strict=True, use_deepspeed=False, + speaker_file_path=None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. 
@@ -751,6 +753,11 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") + speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers.json") + + self.speaker_manager = None + if os.path.exists(speaker_file_path): + self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) From a5c0d9780f7ae57898bc8110fb48c601f0126c94 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 18:48:31 +0100 Subject: [PATCH 03/18] rename manager --- .../layers/xtts/{speaker_manager.py => xtts_manager.py} | 9 +++++++++ 1 file changed, 9 insertions(+) rename TTS/tts/layers/xtts/{speaker_manager.py => xtts_manager.py} (55%) diff --git a/TTS/tts/layers/xtts/speaker_manager.py b/TTS/tts/layers/xtts/xtts_manager.py similarity index 55% rename from TTS/tts/layers/xtts/speaker_manager.py rename to TTS/tts/layers/xtts/xtts_manager.py index a7d89d18ba..2de5ff14c7 100644 --- a/TTS/tts/layers/xtts/speaker_manager.py +++ b/TTS/tts/layers/xtts/xtts_manager.py @@ -7,3 +7,12 @@ def __init__(self, speaker_file_path=None): @property def name_to_id(self): return self.speakers.keys() + + +class LanguageManager(): + def __init__(self, config): + self.langs = config["languages"] + + @property + def name_to_id(self): + return self.langs From 0a90359a421e2c2454f035ade80b2b92d2497b32 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 18:48:49 +0100 Subject: [PATCH 04/18] rename speaker file --- TTS/.models.json | 2 +- TTS/utils/manage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 30e74f8692..1fee7f87f2 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -9,7 +9,7 @@ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json", 
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers.pth" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth" ], "model_hash": "10f92b55c512af7a8d39d650547a15a7", "default_vocoder": null, diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index bfe25443f2..ed7cb2cc07 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -315,7 +315,7 @@ def _set_model_item(self, model_name): f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json", f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json", f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5", - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers.pth", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers_xtts.pth", ], } else: From e3c9dab7a3d72d1536106259254cdff31171987a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 18:49:18 +0100 Subject: [PATCH 05/18] Make CLI work --- TTS/tts/models/xtts.py | 24 ++++++++++++------------ TTS/utils/synthesizer.py | 7 +++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 966f7c0f85..12ed774269 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -11,7 +11,7 @@ from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence -from TTS.tts.layers.xtts.speaker_manager import SpeakerManager +from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec @@ -379,7 +379,7 @@ def get_conditioning_latents( return gpt_cond_latents, speaker_embedding - def synthesize(self, text, config, speaker_wav, language, **kwargs): + def 
synthesize(self, text, config, speaker_wav, language, speaker_id, **kwargs): """Synthesize speech with the given input text. Args: @@ -394,12 +394,6 @@ def synthesize(self, text, config, speaker_wav, language, **kwargs): `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents` as latents used at inference. - """ - return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs) - - def inference_with_config(self, text, config, ref_audio_path, language, **kwargs): - """ - inference with config """ assert ( "zh-cn" if language == "zh" else language in self.config.languages @@ -411,13 +405,18 @@ def inference_with_config(self, text, config, ref_audio_path, language, **kwargs "repetition_penalty": config.repetition_penalty, "top_k": config.top_k, "top_p": config.top_p, + } + settings.update(kwargs) # allow overriding of preset settings with kwargs + if speaker_id is not None: + gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values() + return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings) + settings.update({ "gpt_cond_len": config.gpt_cond_len, "gpt_cond_chunk_len": config.gpt_cond_chunk_len, "max_ref_len": config.max_ref_len, "sound_norm_refs": config.sound_norm_refs, - } - settings.update(kwargs) # allow overriding of preset settings with kwargs - return self.full_inference(text, ref_audio_path, language, **settings) + }) + return self.full_inference(text, speaker_wav, language, **settings) @torch.inference_mode() def full_inference( @@ -753,8 +752,9 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers.json") + speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") + 
self.language_manager = LanguageManager(config) self.speaker_manager = None if os.path.exists(speaker_file_path): self.speaker_manager = SpeakerManager(speaker_file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 781561f973..b98647c30c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -305,7 +305,7 @@ def tts( speaker_embedding = None speaker_id = None if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"): - if speaker_name and isinstance(speaker_name, str): + if speaker_name and isinstance(speaker_name, str) and not self.tts_config.model == "xtts": if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( @@ -335,7 +335,9 @@ def tts( # handle multi-lingual language_id = None if self.tts_languages_file or ( - hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None + hasattr(self.tts_model, "language_manager") + and self.tts_model.language_manager is not None + and not self.tts_config.model == "xtts" ): if len(self.tts_model.language_manager.name_to_id) == 1: language_id = list(self.tts_model.language_manager.name_to_id.values())[0] @@ -366,6 +368,7 @@ def tts( if ( speaker_wav is not None and self.tts_model.speaker_manager is not None + and hasattr(self.tts_model.speaker_manager, "encoder_ap") and self.tts_model.speaker_manager.encoder_ap is not None ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) From 5cd750ac7ede641415051c16f1f9c8111caf13de Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 20:21:53 +0100 Subject: [PATCH 06/18] Fix API and CI --- TTS/tts/layers/xtts/xtts_manager.py | 16 ++++++++++++++++ TTS/tts/models/xtts.py | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py index 
2de5ff14c7..3e7d0f6c91 100644 --- a/TTS/tts/layers/xtts/xtts_manager.py +++ b/TTS/tts/layers/xtts/xtts_manager.py @@ -8,6 +8,14 @@ def __init__(self, speaker_file_path=None): def name_to_id(self): return self.speakers.keys() + @property + def num_speakers(self): + return len(self.name_to_id) + + @property + def speaker_names(self): + return list(self.name_to_id.keys()) + class LanguageManager(): def __init__(self, config): @@ -16,3 +24,11 @@ def __init__(self, config): @property def name_to_id(self): return self.langs + + @property + def num_languages(self): + return len(self.name_to_id) + + @property + def language_names(self): + return list(self.name_to_id) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 12ed774269..83812f377f 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -379,7 +379,7 @@ def get_conditioning_latents( return gpt_cond_latents, speaker_embedding - def synthesize(self, text, config, speaker_wav, language, speaker_id, **kwargs): + def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs): """Synthesize speech with the given input text. 
Args: @@ -520,6 +520,8 @@ def inference( ): language = language.split("-")[0] # remove the country code length_scale = 1.0 / max(speed, 0.05) + gpt_cond_latent = gpt_cond_latent.to(self.device) + speaker_embedding = speaker_embedding.to(self.device) if enable_text_splitting: text = split_sentence(text, language, self.tokenizer.char_limits[language]) else: @@ -628,6 +630,8 @@ def inference_stream( ): language = language.split("-")[0] # remove the country code length_scale = 1.0 / max(speed, 0.05) + gpt_cond_latent = gpt_cond_latent.to(self.device) + speaker_embedding = speaker_embedding.to(self.device) if enable_text_splitting: text = split_sentence(text, language, self.tokenizer.char_limits[language]) else: From 8c20a599d8d4eac32db2f7b8cd9f9b3d1190b73a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 22:11:46 +0100 Subject: [PATCH 07/18] Remove coqui studio integration from TTS --- .github/workflows/api_tests.yml | 53 ----- Makefile | 3 - README.md | 29 --- TTS/api.py | 158 +++--------- TTS/bin/synthesize.py | 53 +---- TTS/cs_api.py | 317 ------------------------- TTS/utils/manage.py | 22 -- docs/source/inference.md | 42 ---- tests/api_tests/__init__.py | 0 tests/api_tests/test_python_api.py | 113 --------- tests/api_tests/test_synthesize_api.py | 25 -- 11 files changed, 33 insertions(+), 782 deletions(-) delete mode 100644 .github/workflows/api_tests.yml delete mode 100644 TTS/cs_api.py delete mode 100644 tests/api_tests/__init__.py delete mode 100644 tests/api_tests/test_python_api.py delete mode 100644 tests/api_tests/test_synthesize_api.py diff --git a/.github/workflows/api_tests.yml b/.github/workflows/api_tests.yml deleted file mode 100644 index 5a3baaad9e..0000000000 --- a/.github/workflows/api_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: api_tests - -on: - push: - branches: - - main -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: | - export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make api_tests - env: - COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }} diff --git a/Makefile b/Makefile index 54aa6eeb18..7446848f46 100644 --- a/Makefile +++ b/Makefile @@ -35,9 +35,6 @@ test_zoo: ## run zoo tests. inference_tests: ## run inference tests. nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests -api_tests: ## run api tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests - data_tests: ## run data tests. nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests diff --git a/README.md b/README.md index ef16c9b6a1..17c362e099 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,6 @@ - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. 
[Docs](https://tts.readthedocs.io/en/dev/models/bark.html) - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. - 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html) -- 📣 **Coqui Studio API** is landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api) -- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live. - 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice) - 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). - 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). @@ -253,29 +251,6 @@ tts.tts_with_vc_to_file( ) ``` -#### Example using [🐸Coqui Studio](https://coqui.ai) voices. -You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai). -To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account). -After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable. - -Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list. 
-These models will follow the naming convention `coqui_studio/en//coqui_studio` - -```python -# XTTS model -models = TTS(cs_api_model="XTTS").list_models() -# Init TTS with the target studio speaker -tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) -# Run TTS -tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH) - -# V1 model -models = TTS(cs_api_model="V1").list_models() -# Run TTS with emotion and speed control -# Emotion control only works with V1 model -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) -``` - #### Example text to speech using **Fairseq models in ~1100 languages** 🤯. For Fairseq models, use the following name format: `tts_models//fairseq/vits`. You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) @@ -353,10 +328,6 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: - ``` - $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav - ``` - - Run a TTS model with its default vocoder model: ``` diff --git a/TTS/api.py b/TTS/api.py index b3aa531b7f..7abc188e74 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -6,7 +6,6 @@ import numpy as np from torch import nn -from TTS.cs_api import CS_API from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -24,7 +23,6 @@ def __init__( vocoder_path: str = None, vocoder_config_path: str = None, progress_bar: bool = True, - cs_api_model: str = "XTTS", gpu=False, ): """🐸TTS python interface that allows to load and use the released models. @@ -60,9 +58,6 @@ def __init__( vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. vocoder_config_path (str, optional): Path to the vocoder config. 
Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. - cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are - "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control. - Defaults to "XTTS". gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ super().__init__() @@ -70,14 +65,12 @@ def __init__( self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None - self.csapi = None - self.cs_api_model = cs_api_model self.model_name = "" if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") if model_name is not None and len(model_name) > 0: - if "tts_models" in model_name or "coqui_studio" in model_name: + if "tts_models" in model_name: self.load_tts_model_by_name(model_name, gpu) elif "voice_conversion_models" in model_name: self.load_vc_model_by_name(model_name, gpu) @@ -99,12 +92,6 @@ def is_multi_speaker(self): return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False - @property - def is_coqui_studio(self): - if self.model_name is None: - return False - return "coqui_studio" in self.model_name - @property def is_multi_lingual(self): # Not sure what sets this to None, but applied a fix to prevent crashing. 
@@ -136,14 +123,7 @@ def get_models_file_path(): return Path(__file__).parent / ".models.json" def list_models(self): - try: - csapi = CS_API(model=self.cs_api_model) - models = csapi.list_speakers_as_tts_models() - except ValueError as e: - print(e) - models = [] - manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) - return manager.list_tts_models() + models + return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) def download_model_by_name(self, model_name: str): model_path, config_path, model_item = self.manager.download_model(model_name) @@ -186,30 +166,26 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): TODO: Add tests """ self.synthesizer = None - self.csapi = None self.model_name = model_name - if "coqui_studio" in model_name: - self.csapi = CS_API() - else: - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name - ) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name + ) - # init synthesizer - # None values are fetch from the model - self.synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=None, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint=None, - encoder_config=None, - model_dir=model_dir, - use_cuda=gpu, - ) + # init synthesizer + # None values are fetch from the model + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint=None, + encoder_config=None, + model_dir=model_dir, + use_cuda=gpu, + ) def load_tts_model_by_path( self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, 
gpu: bool = False @@ -246,77 +222,17 @@ def _check_arguments( **kwargs, ) -> None: """Check if the arguments are valid for the model.""" - if not self.is_coqui_studio: - # check for the coqui tts models - if self.is_multi_speaker and (speaker is None and speaker_wav is None): - raise ValueError("Model is multi-speaker but no `speaker` is provided.") - if self.is_multi_lingual and language is None: - raise ValueError("Model is multi-lingual but no `language` is provided.") - if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs: - raise ValueError("Model is not multi-speaker but `speaker` is provided.") - if not self.is_multi_lingual and language is not None: - raise ValueError("Model is not multi-lingual but `language` is provided.") - if not emotion is None and not speed is None: - raise ValueError("Emotion and speed can only be used with Coqui Studio models.") - else: - if emotion is None: - emotion = "Neutral" - if speed is None: - speed = 1.0 - # check for the studio models - if speaker_wav is not None: - raise ValueError("Coqui Studio models do not support `speaker_wav` argument.") - if speaker is not None: - raise ValueError("Coqui Studio models do not support `speaker` argument.") - if language is not None and language != "en": - raise ValueError("Coqui Studio models currently support only `language=en` argument.") - if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]: - raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.") - - def tts_coqui_studio( - self, - text: str, - speaker_name: str = None, - language: str = None, - emotion: str = None, - speed: float = 1.0, - pipe_out=None, - file_path: str = None, - ) -> Union[np.ndarray, str]: - """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. - - Args: - text (str): - Input text to synthesize. - speaker_name (str, optional): - Speaker name from Coqui Studio. 
Defaults to None. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. - emotion (str, optional): - Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available - with "V1" model. Defaults to None. - speed (float, optional): - Speed of the speech. Defaults to 1.0. - pipe_out (BytesIO, optional): - Flag to stdout the generated TTS wav file for shell pipe. - file_path (str, optional): - Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. - - Returns: - Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file. - """ - speaker_name = self.model_name.split("/")[2] - if file_path is not None: - return self.csapi.tts_to_file( - text=text, - speaker_name=speaker_name, - language=language, - speed=speed, - pipe_out=pipe_out, - emotion=emotion, - file_path=file_path, - )[0] - return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0] + # check for the coqui tts models + if self.is_multi_speaker and (speaker is None and speaker_wav is None): + raise ValueError("Model is multi-speaker but no `speaker` is provided.") + if self.is_multi_lingual and language is None: + raise ValueError("Model is multi-lingual but no `language` is provided.") + if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs: + raise ValueError("Model is not multi-speaker but `speaker` is provided.") + if not self.is_multi_lingual and language is not None: + raise ValueError("Model is not multi-lingual but `language` is provided.") + if not emotion is None and not speed is None: + raise ValueError("Emotion and speed can only be used with Coqui Studio models. 
Which is discontinued.") def tts( self, @@ -357,10 +273,6 @@ def tts( self._check_arguments( speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs ) - if self.csapi is not None: - return self.tts_coqui_studio( - text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed - ) wav = self.synthesizer.tts( text=text, speaker_name=speaker, @@ -419,16 +331,6 @@ def tts_to_file( """ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) - if self.csapi is not None: - return self.tts_coqui_studio( - text=text, - speaker_name=speaker, - language=language, - emotion=emotion, - speed=speed, - file_path=file_path, - pipe_out=pipe_out, - ) wav = self.tts( text=text, speaker=speaker, diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index d9ec3063e6..b125baf7c3 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -66,12 +66,6 @@ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` -- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: - - ``` - $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav - ``` - - Run a TTS model with its default vocoder model: ``` @@ -222,25 +216,6 @@ def main(): default=None, ) parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) - - # args for coqui studio - parser.add_argument( - "--cs_model", - type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", - ) - parser.add_argument( - "--emotion", - type=str, - help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.", - default=None, - ) - parser.add_argument( - "--language", - type=str, - help="Language to condition the model with. 
Only available for 🐸Coqui Studio `XTTS` model.", - default=None, - ) parser.add_argument( "--pipe_out", help="stdout the generated TTS wav file for shell pipe.", @@ -249,13 +224,7 @@ def main(): const=True, default=False, ) - parser.add_argument( - "--speed", - type=float, - help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.", - default=None, - ) - + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -389,7 +358,6 @@ def main(): # CASE1 #list : list pre-trained TTS models if args.list_models: - manager.add_cs_api_models(api.list_models()) manager.list_models() sys.exit() @@ -404,21 +372,6 @@ def main(): manager.model_info_by_full_name(model_query_full_name) sys.exit() - # CASE3: TTS with coqui studio models - if "coqui_studio" in args.model_name: - print(" > Using 🐸Coqui Studio model: ", args.model_name) - api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) - api.tts_to_file( - text=args.text, - emotion=args.emotion, - file_path=args.out_path, - language=args.language, - speed=args.speed, - pipe_out=pipe_out, - ) - print(" > Saving output to ", args.out_path) - return - if args.language_idx is None and args.language is not None: msg = ( "--language is only supported for Coqui Studio models. 
" @@ -426,7 +379,7 @@ def main(): ) raise ValueError(msg) - # CASE4: load pre-trained model paths + # CASE3: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) # tts model @@ -454,7 +407,7 @@ def main(): if args.vocoder_name is not None and not args.vocoder_path: vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - # CASE5: set custom model paths + # CASE4: set custom model paths if args.model_path is not None: tts_path = args.model_path tts_config_path = args.config_path diff --git a/TTS/cs_api.py b/TTS/cs_api.py deleted file mode 100644 index 9dc6c30dd4..0000000000 --- a/TTS/cs_api.py +++ /dev/null @@ -1,317 +0,0 @@ -import http.client -import json -import os -import tempfile -import urllib.request -from typing import Tuple - -import numpy as np -import requests -from scipy.io import wavfile - -from TTS.utils.audio.numpy_transforms import save_wav - - -class Speaker(object): - """Convert dict to object.""" - - def __init__(self, d, is_voice=False): - self.is_voice = is_voice - for k, v in d.items(): - if isinstance(k, (list, tuple)): - setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v]) - else: - setattr(self, k, Speaker(v) if isinstance(v, dict) else v) - - def __repr__(self): - return str(self.__dict__) - - -class CS_API: - """🐸Coqui Studio API Wrapper. - - 🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice - interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different - characteristics. You can use these voices to generate new audio files or use them in your applications. - You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token. 
- You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from - https://app.coqui.ai/account. We can either enter the token as an environment variable as - `export COQUI_STUDIO_TOKEN=` or pass it as `CS_API(api_token=)`. - Visit https://app.coqui.ai/api for more information. - - - Args: - api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable - `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. - - - Example listing all available speakers: - >>> from TTS.api import CS_API - >>> tts = CS_API() - >>> tts.speakers - - Example listing all emotions: - >>> # emotions are only available for `V1` model - >>> from TTS.api import CS_API - >>> tts = CS_API(model="V1") - >>> tts.emotions - - Example with a built-in 🐸 speaker: - >>> from TTS.api import CS_API - >>> tts = CS_API() - >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name) - >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav") - - Example with multi-language model: - >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS") - >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") - """ - - MODEL_ENDPOINTS = { - "V1": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples", - "list_voices": "https://app.coqui.ai/api/v2/voices", - }, - "XTTS": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", - }, - } - - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] - - def __init__(self, api_token=None, model="XTTS"): - self.api_token = api_token - self.model = model - self.headers = None - self._speakers = None - 
self._check_token() - - @staticmethod - def ping_api(): - URL = "https://coqui.gateway.scarf.sh/tts/api" - _ = requests.get(URL) - - @property - def speakers(self): - if self._speakers is None: - self._speakers = self.list_all_speakers() - return self._speakers - - @property - def emotions(self): - """Return a list of available emotions. - - TODO: Get this from the API endpoint. - """ - if self.model == "V1": - return ["Neutral", "Happy", "Sad", "Angry", "Dull"] - else: - raise ValueError(f"❗ Emotions are not available for {self.model}.") - - def _check_token(self): - if self.api_token is None: - self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") - self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} - if not self.api_token: - raise ValueError( - "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n" - "Visit 🔗https://app.coqui.ai/account to get one.\n" - "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n" - "" - ) - - def list_all_speakers(self): - """Return both built-in Coqui Studio speakers and custom voices created by the user.""" - return self.list_speakers() + self.list_voices() - - def list_speakers(self): - """List built-in Coqui Studio speakers.""" - self._check_token() - conn = http.client.HTTPSConnection("app.coqui.ai") - url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) - res = conn.getresponse() - data = res.read() - return [Speaker(s) for s in json.loads(data)["result"]] - - def list_voices(self): - """List custom voices created by the user.""" - conn = http.client.HTTPSConnection("app.coqui.ai") - url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) - res = conn.getresponse() - data = res.read() - return [Speaker(s, True) for s in json.loads(data)["result"]] - - def list_speakers_as_tts_models(self): - """List speakers in 
ModelManager format.""" - models = [] - for speaker in self.speakers: - model = f"coqui_studio/multilingual/{speaker.name}/{self.model}" - models.append(model) - return models - - def name_to_speaker(self, name): - for speaker in self.speakers: - if speaker.name == name: - return speaker - raise ValueError(f"Speaker {name} not found in {self.speakers}") - - def id_to_speaker(self, speaker_id): - for speaker in self.speakers: - if speaker.id == speaker_id: - return speaker - raise ValueError(f"Speaker {speaker_id} not found.") - - @staticmethod - def url_to_np(url): - tmp_file, _ = urllib.request.urlretrieve(url) - rate, data = wavfile.read(tmp_file) - return data, rate - - @staticmethod - def _create_payload(model, text, speaker, speed, emotion, language): - payload = {} - # if speaker.is_voice: - payload["voice_id"] = speaker.id - # else: - payload["speaker_id"] = speaker.id - - if model == "V1": - payload.update( - { - "emotion": emotion, - "name": speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS": - payload.update( - { - "name": speaker.name, - "text": text, - "speed": speed, - "language": language, - } - ) - else: - raise ValueError(f"❗ Unknown model {model}") - return payload - - def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language): - assert text is not None, "❗ text is required for V1 model." - assert speaker_name is not None, "❗ speaker_name is required for V1 model." - if self.model == "V1": - if emotion is None: - emotion = "Neutral" - assert language is None, "❗ language is not supported for V1 model." - elif self.model == "XTTS": - assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS model." - assert ( - language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." 
- return text, speaker_name, speaker_id, emotion, speed, language - - def tts( - self, - text: str, - speaker_name: str = None, - speaker_id=None, - emotion=None, - speed=1.0, - language=None, # pylint: disable=unused-argument - ) -> Tuple[np.ndarray, int]: - """Synthesize speech from text. - - Args: - text (str): Text to synthesize. - speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and - voices (user generated speakers) with `list_voices()`. - speaker_id (str): Speaker ID. If None, the speaker name is used. - emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only - supported by `V1` model. Defaults to None. - speed (float): Speed of the speech. 1.0 is normal speed. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. - """ - self._check_token() - self.ping_api() - - if speaker_name is None and speaker_id is None: - raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") - if speaker_id is None: - speaker = self.name_to_speaker(speaker_name) - else: - speaker = self.id_to_speaker(speaker_id) - - text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args( - text, speaker_name, speaker_id, emotion, speed, language - ) - - conn = http.client.HTTPSConnection("app.coqui.ai") - payload = self._create_payload(self.model, text, speaker, speed, emotion, language) - url = self.MODEL_ENDPOINTS[self.model]["synthesize"] - conn.request("POST", url, json.dumps(payload), self.headers) - res = conn.getresponse() - data = res.read() - try: - wav, sr = self.url_to_np(json.loads(data)["audio_url"]) - except KeyError as e: - raise ValueError(f" [!] 
🐸 API returned error: {data}") from e - return wav, sr - - def tts_to_file( - self, - text: str, - speaker_name: str, - speaker_id=None, - emotion=None, - speed=1.0, - pipe_out=None, - language=None, - file_path: str = None, - ) -> str: - """Synthesize speech from text and save it to a file. - - Args: - text (str): Text to synthesize. - speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and - voices (user generated speakers) with `list_voices()`. - speaker_id (str): Speaker ID. If None, the speaker name is used. - emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". - speed (float): Speed of the speech. 1.0 is normal speed. - pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. - language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". - file_path (str): Path to save the file. If None, a temporary file is created. 
- """ - if file_path is None: - file_path = tempfile.mktemp(".wav") - wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) - save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out) - return file_path - - -if __name__ == "__main__": - import time - - api = CS_API() - print(api.speakers) - print(api.list_speakers_as_tts_models()) - - ts = time.time() - wav, sr = api.tts( - "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name - ) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav" - ) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index ed7cb2cc07..59dcb58ff0 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -68,28 +68,6 @@ def read_models_file(self, file_path): with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) - def add_cs_api_models(self, model_list: List[str]): - """Add list of Coqui Studio model names that are returned from the api - - Each has the following format `/en//` - """ - - def _add_model(model_name: str): - if not "coqui_studio" in model_name: - return - model_type, lang, dataset, model = model_name.split("/") - if model_type not in self.models_dict: - self.models_dict[model_type] = {} - if lang not in self.models_dict[model_type]: - self.models_dict[model_type][lang] = {} - if dataset not in self.models_dict[model_type][lang]: - self.models_dict[model_type][lang][dataset] = {} - if model not in self.models_dict[model_type][lang][dataset]: - self.models_dict[model_type][lang][dataset][model] = {} - - for model_name in model_list: - _add_model(model_name) - def _list_models(self, model_type, model_count=0): if self.verbose: print("\n Name format: type/language/dataset/model") diff --git a/docs/source/inference.md b/docs/source/inference.md index 611a2445bf..56bccfb5b2 100644 --- 
a/docs/source/inference.md +++ b/docs/source/inference.md @@ -172,48 +172,6 @@ tts.tts_with_vc_to_file( ) ``` -#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. - -You can use all of your available speakers in the studio. -[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account). -You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token. - -```python -# If you have a valid API token set you will see the studio speakers as separate models in the list. -# The name format is coqui_studio/en//coqui_studio -models = TTS().list_models() -# Init TTS with the target studio speaker -tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) -# Run TTS -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH) -# Run TTS with emotion and speed control -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) -``` - -If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API. - -```python -from TTS.api import CS_API - -# Init 🐸 Coqui Studio API -# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. - -# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. -api = CS_API(api_token=, model="XTTS") -api.speakers # all the speakers are available with all the models. -api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5) - -# V1 - Fast and lightweight TTS in EN with emotion control. -api = CS_API(api_token=, model="V1") -api.speakers -api.emotions # emotions are only for the V1 model. 
-api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) -``` - #### Example text to speech using **Fairseq models in ~1100 languages** 🤯. For these models use the following name format: `tts_models//fairseq/vits`. diff --git a/tests/api_tests/__init__.py b/tests/api_tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/api_tests/test_python_api.py b/tests/api_tests/test_python_api.py deleted file mode 100644 index 2025fcd9c6..0000000000 --- a/tests/api_tests/test_python_api.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import unittest - -from tests import get_tests_data_path, get_tests_output_path -from TTS.api import CS_API, TTS - -OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav") -cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav") - - -is_coqui_available = os.environ.get("COQUI_STUDIO_TOKEN") - - -if is_coqui_available: - - class CS_APITest(unittest.TestCase): - def test_speakers(self): - tts = CS_API() - self.assertGreater(len(tts.speakers), 1) - - def test_emotions(self): - tts = CS_API() - self.assertGreater(len(tts.emotions), 1) - - def test_list_calls(self): - tts = CS_API() - self.assertGreater(len(tts.list_voices()), 1) - self.assertGreater(len(tts.list_speakers()), 1) - self.assertGreater(len(tts.list_all_speakers()), 1) - self.assertGreater(len(tts.list_speakers_as_tts_models()), 1) - - def test_name_to_speaker(self): - tts = CS_API() - speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2] - speaker = tts.name_to_speaker(speaker_name) - self.assertEqual(speaker.name, speaker_name) - - def test_tts(self): - tts = CS_API() - wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name) - self.assertEqual(sr, 44100) - self.assertGreater(len(wav), 1) - - class TTSTest(unittest.TestCase): - def test_single_speaker_model(self): - 
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) - - error_raised = False - try: - tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de") - except ValueError: - error_raised = True - - tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) - - self.assertTrue(error_raised) - self.assertFalse(tts.is_multi_speaker) - self.assertFalse(tts.is_multi_lingual) - self.assertIsNone(tts.speakers) - self.assertIsNone(tts.languages) - - def test_studio_model(self): - tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio") - tts.tts_to_file(text="This is a test.") - - # check speed > 2.0 raises error - raised_error = False - try: - _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad") # should raise error with speed > 2.0 - except ValueError: - raised_error = True - self.assertTrue(raised_error) - - # check emotion is invalid - raised_error = False - try: - _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo") # should raise error with speed > 2.0 - except ValueError: - raised_error = True - self.assertTrue(raised_error) - - # check valid call - wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad") - self.assertGreater(len(wav), 0) - - def test_fairseq_model(self): # pylint: disable=no-self-use - tts = TTS(model_name="tts_models/eng/fairseq/vits") - tts.tts_to_file(text="This is a test.") - - def test_multi_speaker_multi_lingual_model(self): - tts = TTS() - tts.load_tts_model_by_name(tts.models[0]) # YourTTS - tts.tts_to_file( - text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH - ) - - self.assertTrue(tts.is_multi_speaker) - self.assertTrue(tts.is_multi_lingual) - self.assertGreater(len(tts.speakers), 1) - self.assertGreater(len(tts.languages), 1) - - def test_voice_cloning(self): # pylint: disable=no-self-use - tts = TTS() - 
tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts") - tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) - - def test_voice_conversion(self): # pylint: disable=no-self-use - tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False) - tts.voice_conversion_to_file( - source_wav=cloning_test_wav_path, - target_wav=cloning_test_wav_path, - file_path=OUTPUT_PATH, - ) diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py deleted file mode 100644 index e7b4f12048..0000000000 --- a/tests/api_tests/test_synthesize_api.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -from tests import get_tests_output_path, run_cli - - -def test_synthesize(): - """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - - # 🐸 Coqui studio model - run_cli( - 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' - '--text "This is it" ' - f'--out_path "{output_path}"' - ) - - # 🐸 Coqui studio model with speed arg. - run_cli( - 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' - '--text "This is it but slow" --speed 0.1' - f'--out_path "{output_path}"' - ) - - # test pipe_out command - run_cli(f'tts --text "test." 
--pipe_out --out_path "{output_path}" | aplay') From 5ab228dff2bfe5bf495091d26b58e0ff115df8d3 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 22:31:53 +0100 Subject: [PATCH 08/18] Fix CI --- TTS/bin/synthesize.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index b125baf7c3..b86252ab67 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -372,13 +372,6 @@ def main(): manager.model_info_by_full_name(model_query_full_name) sys.exit() - if args.language_idx is None and args.language is not None: - msg = ( - "--language is only supported for Coqui Studio models. " - "Use --language_idx to specify the target language for multilingual models." - ) - raise ValueError(msg) - # CASE3: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) From ecc38891fbcee6a25e055db6b98920a742548924 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 23:01:30 +0100 Subject: [PATCH 09/18] Fix CI readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 17c362e099..891118c13d 100644 --- a/README.md +++ b/README.md @@ -326,8 +326,6 @@ If you don't specify any models, then it uses LJSpeech based English model. 
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` -- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: - - Run a TTS model with its default vocoder model: ``` From b40750d1f5d48456531a6bd4a89948d8ce5d8c6f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 23:17:54 +0100 Subject: [PATCH 10/18] Remove models that require app.coqui.ai --- TTS/.models.json | 70 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 1fee7f87f2..4809616d25 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -40,22 +40,22 @@ "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" - }, - "bark": { - "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.", - "hf_url": [ - "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", - "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", - "https://app.coqui.ai/tts_model/text_2.pt", - "https://coqui.gateway.scarf.sh/hf/bark/config.json", - "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", - "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" - ], - "default_vocoder": null, - "commit": "e9a1953e", - "license": "MIT", - "contact": "https://www.suno.ai/" } + // "bark": { + // "description": "🐶 Bark TTS model released by suno-ai. 
You can find the original implementation in https://github.com/suno-ai/bark.", + // "hf_url": [ + // "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", + // "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", + // "https://app.coqui.ai/tts_model/text_2.pt", + // "https://coqui.gateway.scarf.sh/hf/bark/config.json", + // "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", + // "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" + // ], + // "default_vocoder": null, + // "commit": "e9a1953e", + // "license": "MIT", + // "contact": "https://www.suno.ai/" + // } } }, "bg": { @@ -267,26 +267,26 @@ "contact": "adamfroghyar@gmail.com" } }, - "multi-dataset": { - "tortoise-v2": { - "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", - "github_rls_url": [ - "https://app.coqui.ai/tts_model/autoregressive.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" - ], - "commit": "c1875f6", - "default_vocoder": null, - "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", - "license": "apache 2.0" - } - }, + // "multi-dataset": { + // "tortoise-v2": { + // "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", + // "github_rls_url": [ + // "https://app.coqui.ai/tts_model/autoregressive.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", + // 
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", + // "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" + // ], + // "commit": "c1875f6", + // "default_vocoder": null, + // "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", + // "license": "apache 2.0" + // } + // }, "jenny": { "jenny": { "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits", From 605a857add865c45f83ca5e0717be2b4da4fa425 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 23:35:07 +0100 Subject: [PATCH 11/18] Remove tortoise --- .github/workflows/zoo_tests_tortoise.yml | 10 ++++++++-- tests/zoo_tests/test_models.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/zoo_tests_tortoise.yml b/.github/workflows/zoo_tests_tortoise.yml index 31442877e1..04ebb0fa58 100644 --- a/.github/workflows/zoo_tests_tortoise.yml +++ b/.github/workflows/zoo_tests_tortoise.yml @@ -8,10 +8,16 @@ on: types: [opened, synchronize, reopened] jobs: check_skip: + # always skip this job while tortoise zoo is not fixed + if: false runs-on: ubuntu-latest - if: "! contains(github.event.head_commit.message, '[ci skip]')" steps: - - run: echo "${{ github.event.head_commit.message }}" + - name: Check skip + run: echo "Skipping zoo-tests-tortoise" + + # if: "! 
contains(github.event.head_commit.message, '[ci skip]')" + # steps: + # - run: echo "${{ github.event.head_commit.message }}" test: runs-on: ubuntu-latest diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 8fa56e287a..1c42ffaa49 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -13,7 +13,7 @@ MODELS_WITH_SEP_TESTS = [ "tts_models/multilingual/multi-dataset/bark", - "tts_models/en/multi-dataset/tortoise-v2", + # "tts_models/en/multi-dataset/tortoise-v2", "tts_models/multilingual/multi-dataset/xtts_v1.1", "tts_models/multilingual/multi-dataset/xtts_v2", ] From d47b6df4e50fb37e2c0c586f022fa9c939f71153 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 23:35:27 +0100 Subject: [PATCH 12/18] Make comments in .model.json valid --- TTS/utils/manage.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 59dcb58ff0..3a527f4609 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -11,7 +11,7 @@ import requests from tqdm import tqdm -from TTS.config import load_config +from TTS.config import load_config, read_json_with_comments from TTS.utils.generic_utils import get_user_data_dir LICENSE_URLS = { @@ -65,8 +65,7 @@ def read_models_file(self, file_path): Args: file_path (str): path to .models.json. 
""" - with open(file_path, "r", encoding="utf-8") as json_file: - self.models_dict = json.load(json_file) + self.models_dict = read_json_with_comments(file_path) def _list_models(self, model_type, model_count=0): if self.verbose: From 61b67ef16ff2bfdc7533f380226c33d7c55105f9 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 11 Dec 2023 23:58:52 +0100 Subject: [PATCH 13/18] Fix read_json_with_comments --- .github/workflows/zoo_tests_tortoise.yml | 58 ------------------------ TTS/config/__init__.py | 9 ++-- 2 files changed, 3 insertions(+), 64 deletions(-) delete mode 100644 .github/workflows/zoo_tests_tortoise.yml diff --git a/.github/workflows/zoo_tests_tortoise.yml b/.github/workflows/zoo_tests_tortoise.yml deleted file mode 100644 index 04ebb0fa58..0000000000 --- a/.github/workflows/zoo_tests_tortoise.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: zoo-tests-tortoise - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - # always skip this job while tortoise zoo is not fixed - if: false - runs-on: ubuntu-latest - steps: - - name: Check skip - run: echo "Skipping zoo-tests-tortoise" - - # if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - # steps: - # - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_tortoise diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 25b4baef81..c5a6dd68e2 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -16,12 +16,9 @@ def read_json_with_comments(json_path): # fallback to json with fsspec.open(json_path, "r", encoding="utf-8") as f: input_str = f.read() - # handle comments - input_str = re.sub(r"\\\n", "", input_str) - input_str = re.sub(r"//.*\n", "\n", input_str) - data = json.loads(input_str) - return data - + # handle comments but not urls with // + input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str) + return json.loads(input_str) def register_config(model_name: str) -> Coqpit: """Find the right 
config for the given model name. From b6e1ac66d94958a00c8f116b25128cf273c44de8 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 12 Dec 2023 09:19:56 -0300 Subject: [PATCH 14/18] Add docs --- docs/source/models/xtts.md | 283 +++++++++++-------------------------- 1 file changed, 79 insertions(+), 204 deletions(-) diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index acb73114b3..69bbc15d7b 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -21,7 +21,7 @@ a few tricks to make it faster and support streaming inference. - Across the board quality improvements. ### Code -Current implementation only supports inference. +Current implementation only supports inference and GPT encoder training. ### Languages As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). @@ -36,9 +36,71 @@ Come and join in our 🐸Community. We're active on [Discord](https://discord.gg You can also mail us at info@coqui.ai. ### Inference + +#### 🐸TTS Command line + +You can check all supported languages with the following command: + +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --list_language_idx +``` + +You can check all Coqui available speakers with the following command: + +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --list_speaker_idx +``` + +##### Coqui speakers +You can do inference using one of the available speakers using the following command: + +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." 
\ + --speaker_idx "Ana Florence" \ + --language_idx en \ + --use_cuda true +``` + +##### Clone a voice +You can clone a speaker voice with a single or multiple references: + +###### Single reference + +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --text "Bugün okula gitmek istemiyorum." \ + --speaker_wav /path/to/target/speaker.wav \ + --language_idx tr \ + --use_cuda true +``` + +###### Multiple references +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --text "Bugün okula gitmek istemiyorum." \ + --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \ + --language_idx tr \ + --use_cuda true +``` +or for all wav files in a directory you can use: + +```console + tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ + --text "Bugün okula gitmek istemiyorum." \ + --speaker_wav /path/to/target/*.wav \ + --language_idx tr \ + --use_cuda true +``` + #### 🐸TTS API -##### Single reference +##### Clone a voice +You can clone a speaker voice with a single or multiple references: + +###### Single reference Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio. You can optionally disable sentence splitting for better coherence but more VRAM and possibly hitting models context length limit. @@ -56,7 +118,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t ) ``` -##### Multiple references +###### Multiple references You can pass multiple audio files to the `speaker_wav` argument for better voice cloning. @@ -81,35 +143,24 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t language="en") ``` -#### 🐸TTS Command line +##### Coqui speakers -##### Single reference -```console - tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ - --text "Bugün okula gitmek istemiyorum." 
\ - --speaker_wav /path/to/target/speaker.wav \ - --language_idx tr \ - --use_cuda true -``` +You can do inference using one of the available speakers using the following code: -##### Multiple references -```console - tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ - --text "Bugün okula gitmek istemiyorum." \ - --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \ - --language_idx tr \ - --use_cuda true -``` -or for all wav files in a directory you can use: +```python +from TTS.api import TTS +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) -```console - tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ - --text "Bugün okula gitmek istemiyorum." \ - --speaker_wav /path/to/target/*.wav \ - --language_idx tr \ - --use_cuda true +# generate speech by cloning a voice using default settings +tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + file_path="output.wav", + speaker="Ana Florence", + language="en", + split_sentences=True + ) ``` + #### 🐸TTS Model API To use the model API, you need to download the model files and pass config and model file paths manually. @@ -157,180 +208,4 @@ model.cuda() print("Computing speaker latents...") gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) -print("Inference...") -out = model.inference( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - temperature=0.7, # Add custom parameters here -) -torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) -``` - - -##### Streaming manually - -Here the goal is to stream the audio as it is being generated. This is useful for real-time applications. 
-Streaming inference is typically slower than regular inference, but it allows to get a first chunk of audio faster. - - -```python -import os -import time -import torch -import torchaudio -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts - -print("Loading model...") -config = XttsConfig() -config.load_json("/path/to/xtts/config.json") -model = Xtts.init_from_config(config) -model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True) -model.cuda() - -print("Computing speaker latents...") -gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) - -print("Inference...") -t0 = time.time() -chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding -) - -wav_chuncks = [] -for i, chunk in enumerate(chunks): - if i == 0: - print(f"Time to first chunck: {time.time() - t0}") - print(f"Received chunk {i} of audio length {chunk.shape[-1]}") - wav_chuncks.append(chunk) -wav = torch.cat(wav_chuncks, dim=0) -torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000) -``` - - -### Training - -#### Easy training -To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio demo that implements the whole fine-tuning pipeline. The gradio demo enables the user to easily do the following steps: - -- Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter -- Train the XTTS GPT encoder with the processed data -- Inference support using the fine-tuned model - -The user can run this gradio demo locally or remotely using a Colab Notebook. - -##### Run demo on Colab -To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we did a Google Colab Notebook. 
- -The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). - -To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](). - -If you are not able to acess the video you need to follow the steps: - -1. Open the Colab notebook and start the demo by runining the first two cells (ignore pip install errors in the first one). -2. Click on the link "Running on public URL:" on the second cell output. -3. On the first Tab (1 - Data processing) you need to select the audio file or files, wait for upload, and then click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done. -4. Soon as the dataset processing is done you need to go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes. -5. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference". - - -##### Run demo locally - -To run the demo locally you need to do the following steps: -1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation). -2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt` -3. Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py` -4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model. - - -If you are not able to access the video, here is what you need to do: - -1. 
On the first Tab (1 - Data processing) select the audio file or files, wait for upload -2. Click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done. -3. Go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. it will take some time. -4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. -5. Now you can run inference with the model by clicking on the button "Step 4 - Inference". - -#### Advanced training - -A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py - -You need to change the fields of the `BaseDatasetConfig` to match your dataset and then update `GPTArgs` and `GPTTrainerConfig` fields as you need. By default, it will use the same parameters that XTTS v1.1 model was trained with. To speed up the model convergence, as default, it will also download the XTTS v1.1 checkpoint and load it. - -After training you can do inference following the code bellow. 
- -```python -import os -import torch -import torchaudio -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts - -# Add here the xtts_config path -CONFIG_PATH = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT-October-23-2023_10+36AM-653f2e75/config.json" -# Add here the vocab file that you have used to train the model -TOKENIZER_PATH = "recipes/ljspeech/xtts_v1/run/training/XTTS_v2_original_model_files/vocab.json" -# Add here the checkpoint that you want to do inference with -XTTS_CHECKPOINT = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT/best_model.pth" -# Add here the speaker reference -SPEAKER_REFERENCE = "LjSpeech_reference.wav" - -# output wav path -OUTPUT_WAV_PATH = "xtts-ft.wav" - -print("Loading model...") -config = XttsConfig() -config.load_json(CONFIG_PATH) -model = Xtts.init_from_config(config) -model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False) -model.cuda() - -print("Computing speaker latents...") -gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) - -print("Inference...") -out = model.inference( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - temperature=0.7, # Add custom parameters here -) -torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) -``` - - - -## References and Acknowledgements -- VallE: https://arxiv.org/abs/2301.02111 -- Tortoise Repo: https://github.com/neonbjb/tortoise-tts -- Faster implementation: https://github.com/152334H/tortoise-tts-fast -- Univnet: https://arxiv.org/abs/2106.07889 -- Latent Diffusion:https://arxiv.org/abs/2112.10752 -- DALL-E: https://arxiv.org/abs/2102.12092 -- Perceiver: https://arxiv.org/abs/2103.03206 - - -## XttsConfig -```{eval-rst} -.. 
autoclass:: TTS.tts.configs.xtts_config.XttsConfig - :members: -``` - -## XttsArgs -```{eval-rst} -.. autoclass:: TTS.tts.models.xtts.XttsArgs - :members: -``` - -## XTTS Model -```{eval-rst} -.. autoclass:: TTS.tts.models.xtts.XTTS - :members: -``` +print("Inference...") \ No newline at end of file From 4b33699b415752748a77de45ae4fa93e7aaf0399 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 12 Dec 2023 09:22:07 -0300 Subject: [PATCH 15/18] Update docs --- docs/source/models/xtts.md | 182 ++++++++++++++++++++++++++++++++++++- 1 file changed, 179 insertions(+), 3 deletions(-) diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 69bbc15d7b..b979d04f6e 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -65,7 +65,7 @@ You can do inference using one of the available speakers using the following com ``` ##### Clone a voice -You can clone a speaker voice with a single or multiple references: +You can clone a speaker voice using a single or multiple references: ###### Single reference @@ -98,7 +98,7 @@ or for all wav files in a directory you can use: #### 🐸TTS API ##### Clone a voice -You can clone a speaker voice with a single or multiple references: +You can clone a speaker voice using a single or multiple references: ###### Single reference @@ -208,4 +208,180 @@ model.cuda() print("Computing speaker latents...") gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) -print("Inference...") \ No newline at end of file +print("Inference...") +out = model.inference( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + temperature=0.7, # Add custom parameters here +) +torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) +``` + + +##### Streaming manually + +Here the goal is to stream the audio as it is being generated. 
This is useful for real-time applications. +Streaming inference is typically slower than regular inference, but it lets you get the first chunk of audio sooner. + + +```python +import os +import time +import torch +import torchaudio +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.models.xtts import Xtts + +print("Loading model...") +config = XttsConfig() +config.load_json("/path/to/xtts/config.json") +model = Xtts.init_from_config(config) +model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True) +model.cuda() + +print("Computing speaker latents...") +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) + +print("Inference...") +t0 = time.time() +chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding +) + +wav_chunks = [] +for i, chunk in enumerate(chunks): + if i == 0: + print(f"Time to first chunk: {time.time() - t0}") + print(f"Received chunk {i} of audio length {chunk.shape[-1]}") + wav_chunks.append(chunk) +wav = torch.cat(wav_chunks, dim=0) +torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000) +``` + + +### Training + +#### Easy training +To make `XTTS_v2` GPT encoder training easier for beginner users we built a Gradio demo that implements the whole fine-tuning pipeline. The Gradio demo enables the user to easily do the following steps: + +- Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter +- Train the XTTS GPT encoder with the processed data +- Inference support using the fine-tuned model + +The user can run this Gradio demo locally or remotely using a Colab Notebook. + +##### Run demo on Colab +To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we provide a Google Colab Notebook.
+ +The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). + +To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](). + +If you are not able to access the video you need to follow the steps: + +1. Open the Colab notebook and start the demo by running the first two cells (ignore pip install errors in the first one). +2. Click on the link "Running on public URL:" on the second cell output. +3. On the first Tab (1 - Data processing) you need to select the audio file or files, wait for upload, and then click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done. +4. As soon as the dataset processing is done you need to go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes. +5. As soon as the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference". + + +##### Run demo locally + +To run the demo locally you need to do the following steps: +1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation). +2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt` +3. Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py` +4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model. + + +If you are not able to access the video, here is what you need to do: + +1.
On the first Tab (1 - Data processing) select the audio file or files, wait for upload. +2. Click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done. +3. Go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. It will take some time. +4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. +5. Now you can run inference with the model by clicking on the button "Step 4 - Inference". + +#### Advanced training + +A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py + +You need to change the fields of the `BaseDatasetConfig` to match your dataset and then update `GPTArgs` and `GPTTrainerConfig` fields as you need. By default, it will use the same parameters that XTTS v1.1 model was trained with. To speed up the model convergence, as default, it will also download the XTTS v1.1 checkpoint and load it. + +After training you can do inference following the code below.
+ +```python +import os +import torch +import torchaudio +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.models.xtts import Xtts + +# Add here the xtts_config path +CONFIG_PATH = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT-October-23-2023_10+36AM-653f2e75/config.json" +# Add here the vocab file that you have used to train the model +TOKENIZER_PATH = "recipes/ljspeech/xtts_v1/run/training/XTTS_v2_original_model_files/vocab.json" +# Add here the checkpoint that you want to do inference with +XTTS_CHECKPOINT = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT/best_model.pth" +# Add here the speaker reference +SPEAKER_REFERENCE = "LjSpeech_reference.wav" + +# output wav path +OUTPUT_WAV_PATH = "xtts-ft.wav" + +print("Loading model...") +config = XttsConfig() +config.load_json(CONFIG_PATH) +model = Xtts.init_from_config(config) +model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False) +model.cuda() + +print("Computing speaker latents...") +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) + +print("Inference...") +out = model.inference( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + temperature=0.7, # Add custom parameters here +) +torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) +``` + + + +## References and Acknowledgements +- VallE: https://arxiv.org/abs/2301.02111 +- Tortoise Repo: https://github.com/neonbjb/tortoise-tts +- Faster implementation: https://github.com/152334H/tortoise-tts-fast +- Univnet: https://arxiv.org/abs/2106.07889 +- Latent Diffusion:https://arxiv.org/abs/2112.10752 +- DALL-E: https://arxiv.org/abs/2102.12092 +- Perceiver: https://arxiv.org/abs/2103.03206 + + +## XttsConfig +```{eval-rst} +.. 
autoclass:: TTS.tts.configs.xtts_config.XttsConfig + :members: +``` + +## XttsArgs +```{eval-rst} +.. autoclass:: TTS.tts.models.xtts.XttsArgs + :members: +``` + +## XTTS Model +```{eval-rst} +.. autoclass:: TTS.tts.models.xtts.XTTS + :members: +``` From 4dc0722bbc138d2183c47b6548214df0fabd66c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 12 Dec 2023 13:28:16 +0100 Subject: [PATCH 16/18] Update .models.json --- TTS/.models.json | 70 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 4809616d25..d7745ffea5 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -41,21 +41,21 @@ "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" } - // "bark": { - // "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.", - // "hf_url": [ - // "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", - // "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", - // "https://app.coqui.ai/tts_model/text_2.pt", - // "https://coqui.gateway.scarf.sh/hf/bark/config.json", - // "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", - // "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" - // ], - // "default_vocoder": null, - // "commit": "e9a1953e", - // "license": "MIT", - // "contact": "https://www.suno.ai/" - // } + "bark": { + "description": "🐶 Bark TTS model released by suno-ai. 
You can find the original implementation in https://github.com/suno-ai/bark.", + "hf_url": [ + "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", + "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", + "https://coqui.gateway.scarf.sh/hf/text_2.pt", + "https://coqui.gateway.scarf.sh/hf/bark/config.json", + "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", + "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" + ], + "default_vocoder": null, + "commit": "e9a1953e", + "license": "MIT", + "contact": "https://www.suno.ai/" + } } }, "bg": { @@ -267,26 +267,26 @@ "contact": "adamfroghyar@gmail.com" } }, - // "multi-dataset": { - // "tortoise-v2": { - // "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", - // "github_rls_url": [ - // "https://app.coqui.ai/tts_model/autoregressive.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", - // "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" - // ], - // "commit": "c1875f6", - // "default_vocoder": null, - // "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", - // "license": "apache 2.0" - // } - // }, + "multi-dataset": { + "tortoise-v2": { + "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", + "github_rls_url": [ + "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", + 
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" + ], + "commit": "c1875f6", + "default_vocoder": null, + "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", + "license": "apache 2.0" + } + }, "jenny": { "jenny": { "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits", From 8999780aff330a6d4f5b0709268a647505e47762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 12 Dec 2023 13:30:21 +0100 Subject: [PATCH 17/18] Update test_models.py --- tests/zoo_tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 1c42ffaa49..8fa56e287a 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -13,7 +13,7 @@ MODELS_WITH_SEP_TESTS = [ "tts_models/multilingual/multi-dataset/bark", - # "tts_models/en/multi-dataset/tortoise-v2", + "tts_models/en/multi-dataset/tortoise-v2", "tts_models/multilingual/multi-dataset/xtts_v1.1", "tts_models/multilingual/multi-dataset/xtts_v2", ] From 8e6a7cbfbf66aaab0b05971d7a2a7a9113a9448e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 12 Dec 2023 13:50:01 +0100 Subject: [PATCH 18/18] Update .models.json --- TTS/.models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/.models.json b/TTS/.models.json index d7745ffea5..b349e7397b 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -40,7 +40,7 @@ "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" - } + }, "bark": { "description": "🐶 Bark TTS 
model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.", "hf_url": [