Merge pull request #3173 from coqui-ai/dev
v0.20.2
erogol committed Nov 8, 2023
2 parents 063556a + 46d9c27 commit ab57c36
Showing 15 changed files with 383 additions and 437 deletions.
12 changes: 3 additions & 9 deletions README.md
@@ -2,7 +2,7 @@
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())

# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)

# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)

-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
```
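
For quick reference, a minimal sketch of the updated README flow with the new `xtts_v2` checkpoint shown above (`speaker.wav` and `output.wav` are hypothetical paths; since XTTS is a multilingual voice-cloning model, both `speaker_wav` and `language` must be set):

```python
import torch

from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the new XTTS v2 checkpoint referenced in the README
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Clone the voice in speaker.wav and synthesize English speech
tts.tts_to_file(
    text="Hello world!",
    speaker_wav="speaker.wav",  # hypothetical reference clip
    language="en",
    file_path="output.wav",
)
```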

#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
-0.20.1
+0.20.2
6 changes: 3 additions & 3 deletions TTS/api.py
@@ -60,7 +60,7 @@ def __init__(
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
"XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
"XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
@@ -275,7 +275,7 @@ def tts_coqui_studio(
speaker_name (str, optional):
Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
emotion (str, optional):
Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
with "V1" model. Defaults to None.
@@ -321,7 +321,7 @@ def tts(
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
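
Per the updated docstring, a hedged sketch of selecting a Coqui Studio model through `TTS.api` (assumes a valid `COQUI_STUDIO_TOKEN` in the environment; `XTTS-multilingual` is no longer a valid choice):

```python
from TTS.api import TTS

# cs_api_model now accepts only "XTTS" or "V1"
api = TTS(cs_api_model="XTTS")
print(api.list_models())  # Coqui Studio speakers appear alongside local models
```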
4 changes: 2 additions & 2 deletions TTS/bin/synthesize.py
@@ -227,7 +227,7 @@ def main():
parser.add_argument(
"--cs_model",
type=str,
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
)
parser.add_argument(
"--emotion",
@@ -238,7 +238,7 @@
parser.add_argument(
"--language",
type=str,
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
default=None,
)
parser.add_argument(
54 changes: 13 additions & 41 deletions TTS/cs_api.py
@@ -43,7 +43,7 @@ class CS_API:
Args:
api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
`COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`.
Example listing all available speakers:
@@ -65,7 +65,7 @@ class CS_API:
Example with multi-language model:
>>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS-multilang")
+        >>> tts = CS_API(model="XTTS")
>>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
"""

@@ -78,16 +78,12 @@ class CS_API:
"XTTS": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
},
"XTTS-multilang": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
},
}

SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]

SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

def __init__(self, api_token=None, model="XTTS"):
self.api_token = api_token
@@ -139,7 +135,7 @@ def list_speakers(self):
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
conn.request("GET", f"{url}?per_page=100", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@@ -148,7 +144,7 @@ def list_voices(self):
"""List custom voices created by the user."""
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
conn.request("GET", f"{url}", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s, True) for s in json.loads(data)["result"]]
@@ -197,14 +193,6 @@ def _create_payload(model, text, speaker, speed, emotion, language):
}
)
elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS-multilang":
payload.update(
{
"name": speaker.name,
@@ -226,13 +214,10 @@ def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
assert language is None, "❗ language is not supported for V1 model."
elif self.model == "XTTS":
assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
-        elif self.model == "XTTS-multilang":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert language is not None, "❗ Language is required for XTTS model."
assert (
language in self.SUPPORTED_LANGUAGES
), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
return text, speaker_name, speaker_id, emotion, speed, language

def tts(
@@ -255,7 +240,7 @@ def tts(
supported by `V1` model. Defaults to None.
speed (float): Speed of the speech. 1.0 is normal speed.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
"""
self._check_token()
self.ping_api()
@@ -305,7 +290,7 @@ def tts_to_file(
speed (float): Speed of the speech. 1.0 is normal speed.
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
file_path (str): Path to save the file. If None, a temporary file is created.
"""
if file_path is None:
@@ -323,20 +308,7 @@
print(api.list_speakers_as_tts_models())

ts = time.time()
wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")

filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")

api = CS_API(model="XTTS-multilang")
print(api.speakers)

ts = time.time()
wav, sr = api.tts(
"It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
)
wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")

filepath = api.tts_to_file(
text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
)
filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")
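
A short sketch of the tightened argument checking (assumes a valid API token): with the `XTTS` Studio model, `language` is now required and must be in `SUPPORTED_LANGUAGES`, otherwise `_check_tts_args` raises an `AssertionError`.

```python
from TTS.cs_api import CS_API

api = CS_API(model="XTTS")
try:
    # Omitting language now trips the assertion added above
    api.tts("Hello world", speaker_name=api.speakers[0].name)
except AssertionError as err:
    print(err)  # ❗ Language is required for XTTS model.
```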
22 changes: 0 additions & 22 deletions TTS/tts/configs/xtts_config.py
@@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
-        cond_free_k (float):
-            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
-            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
-            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
-        diffusion_temperature (float):
-            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
-            are the "mean" prediction of the diffusion network and will sound bland and smeared.
-            Defaults to `1.0`.
-        num_gpt_outputs (int):
-            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
-            As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
-            Defaults to `16`.
-        decoder_iterations (int):
-            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
-            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
-            however. Defaults to `30`.
-        decoder_sampler (str):
-            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
-    cond_free_k: float = 2.0
-    diffusion_temperature: float = 1.0
    num_gpt_outputs: int = 1
-    decoder_iterations: int = 30
-    decoder_sampler: str = "ddim"

# cloning
gpt_cond_len: int = 3
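
With the diffusion decoder gone, only the GPT sampling knobs remain; a sketch of building the slimmed-down config (field names and defaults taken from the diff above):

```python
from TTS.tts.configs.xtts_config import XttsConfig

# cond_free_k, diffusion_temperature, decoder_iterations and decoder_sampler
# no longer exist as fields.
config = XttsConfig(
    repetition_penalty=2.0,
    top_k=50,
    top_p=0.85,
    gpt_cond_len=3,  # seconds of audio used to condition the autoregressive model
)
```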
42 changes: 41 additions & 1 deletion TTS/tts/layers/xtts/tokenizer.py
@@ -8,6 +8,7 @@
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
+from functools import cached_property

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words

@@ -535,11 +536,50 @@ def korean_cleaners(text):
class VoiceBpeTokenizer:
def __init__(self, vocab_file=None):
self.tokenizer = None
-        self.katsu = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
+        self.char_limits = {
+            "en": 250,
+            "de": 253,
+            "fr": 273,
+            "es": 239,
+            "it": 213,
+            "pt": 203,
+            "pl": 224,
+            "zh-cn": 82,
+            "ar": 166,
+            "cs": 186,
+            "ru": 182,
+            "nl": 251,
+            "tr": 226,
+            "ja": 71,
+            "hu": 224,
+            "ko": 95,
+        }

+    @cached_property
+    def katsu(self):
+        import cutlet
+        return cutlet.Cutlet()

+    def check_input_length(self, txt, lang):
+        limit = self.char_limits.get(lang, 250)
+        if len(txt) > limit:
+            print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")

+    def preprocess_text(self, txt, lang):
+        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
+            txt = multilingual_cleaners(txt, lang)
+            if lang == "zh-cn":
+                txt = chinese_transliterate(txt)
+        elif lang == "ja":
+            txt = japanese_cleaners(txt, self.katsu)
+        else:
+            raise NotImplementedError()
+        return txt

def encode(self, txt, lang):
+        self.check_input_length(txt, lang)
+        txt = self.preprocess_text(txt, lang)
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
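
A sketch of the new length check (assumes an XTTS `vocab.json` is available locally): `check_input_length` only warns, so over-long inputs still encode but may yield truncated audio, and `katsu` is now built lazily via `cached_property`, so `cutlet` is only imported when Japanese text is processed.

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

tok = VoiceBpeTokenizer(vocab_file="vocab.json")  # hypothetical local path

# 300 characters exceeds the English limit of 250 -> prints a warning
tok.check_input_length("a" * 300, "en")

# encode() now runs the length check and text cleaning before BPE encoding
ids = tok.encode("Hello world", "en")
```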
