Add stdout option (#3027)

* add cli options for play and speed --play argument uses simpleaudio to play the tts wav --speed <float 0.0-2.0> passes speed argument to Coqui Studio models * remove simpleaudio not referenced in file * fix simpleaudio dependency version * add ALSA headers for simpleaudio compilation * Dockerfile ALSA headers for simpleaudio * base changes to use stdout instead of play audio Considering conversion to pipe wav data for audio playback with other program like aplay. This is incomplete code. Using to get feedback before proceeding with implementation. * remove play for pipe_out arg that suppresses stdout removed play and simpleaudio dependency in place of pipe functionality to allow passing wav file data to a program dedicated to playing audio. * scipy.io.wavfile.write fails with /dev/null target * Streaming inference for XTTS 🚀 (#3035) * v0.17.7 * Redownload XTTS when the local and remote config do not match * Remove unused method * Print a message when it is already downloaded * Try-except to present error when the user doesn't have a connection * Fix style * 0.17.8 * v0.17.8 --------- Co-authored-by: Julian Weber <julian.weber@hotmail.fr> Co-authored-by: Eren Gölge <erogol@hotmail.com> Co-authored-by: Edresson Casanova <edresson1@gmail.com> Co-authored-by: ggoknar <ggoknar@coqui.ai>
coqui-ai · Oct 16, 2023 · a151d70 · a151d70
1 parent cae185f
commit a151d70
Show file tree

Hide file tree

Showing 8 changed files with 267 additions and 171 deletions.
diff --git a/README.md b/README.md
@@ -347,6 +347,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
  $ tts --text "Text for TTS" --out_path output/path/speech.wav
  ```
 
+- Run TTS and pipe out the generated TTS wav file data:
+
+ ```
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+ ```
+
+- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
+ ```
+
 - Run a TTS model with its default vocoder model:
 
  ```

diff --git a/TTS/api.py b/TTS/api.py
@@ -112,7 +112,6 @@ def is_multi_lingual(self):
  return self.synthesizer.tts_model.language_manager.num_languages > 1
  return False
 
-
  @property
  def speakers(self):
  if not self.is_multi_speaker:
@@ -265,6 +264,7 @@ def tts_coqui_studio(
  language: str = None,
  emotion: str = None,
  speed: float = 1.0,
+ pipe_out = None,
  file_path: str = None,
  ) -> Union[np.ndarray, str]:
  """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -281,6 +281,8 @@ def tts_coqui_studio(
  with "V1" model. Defaults to None.
  speed (float, optional):
  Speed of the speech. Defaults to 1.0.
+ pipe_out (BytesIO, optional):
+ Flag to stdout the generated TTS wav file for shell pipe.
  file_path (str, optional):
  Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
 
@@ -294,6 +296,7 @@ def tts_coqui_studio(
  speaker_name=speaker_name,
  language=language,
  speed=speed,
+ pipe_out=pipe_out,
  emotion=emotion,
  file_path=file_path,
  )[0]
@@ -356,6 +359,7 @@ def tts_to_file(
  speaker_wav: str = None,
  emotion: str = None,
  speed: float = 1.0,
+ pipe_out = None,
  file_path: str = "output.wav",
  **kwargs,
  ):
@@ -377,6 +381,8 @@ def tts_to_file(
  Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
  speed (float, optional):
  Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
+ pipe_out (BytesIO, optional):
+ Flag to stdout the generated TTS wav file for shell pipe.
  file_path (str, optional):
  Output file path. Defaults to "output.wav".
  kwargs (dict, optional):
@@ -386,10 +392,16 @@ def tts_to_file(
 
  if self.csapi is not None:
  return self.tts_coqui_studio(
- text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+ text=text,
+ speaker_name=speaker,
+ language=language,
+ emotion=emotion,
+ speed=speed,
+ file_path=file_path,
+ pipe_out=pipe_out,
  )
  wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
- self.synthesizer.save_wav(wav=wav, path=file_path)
+ self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
  return file_path
 
  def voice_conversion(