diff --git a/README.md b/README.md
index ad4a90b9a7..4f386ecdc2 100644
--- a/README.md
+++ b/README.md
@@ -347,6 +347,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
   $ tts --text "Text for TTS" --out_path output/path/speech.wav
   ```
 
+- Run TTS and pipe out the generated TTS wav file data:
+
+  ```
+  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+  ```
+
+- Run TTS and define a speed factor for 🐸Coqui Studio models, between 0.0 and 2.0:
+
+  ```
+  $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav
+  ```
+
 - Run a TTS model with its default vocoder model:
 
   ```
diff --git a/TTS/api.py b/TTS/api.py
index e1d167a997..dd5820f8a4 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -112,7 +112,6 @@ def is_multi_lingual(self):
             return self.synthesizer.tts_model.language_manager.num_languages > 1
         return False
 
-
     @property
     def speakers(self):
         if not self.is_multi_speaker:
@@ -265,6 +264,7 @@ def tts_coqui_studio(
         language: str = None,
         emotion: str = None,
         speed: float = 1.0,
+        pipe_out=None,
         file_path: str = None,
     ) -> Union[np.ndarray, str]:
         """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -281,6 +281,8 @@
                 with "V1" model. Defaults to None.
             speed (float, optional):
                 Speed of the speech. Defaults to 1.0.
+            pipe_out (BytesIO, optional):
+                Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping. Defaults to None.
             file_path (str, optional):
                 Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
 
@@ -294,6 +296,7 @@
             speaker_name=speaker_name,
             language=language,
             speed=speed,
+            pipe_out=pipe_out,
             emotion=emotion,
             file_path=file_path,
         )[0]
@@ -356,6 +359,7 @@ def tts_to_file(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = 1.0,
+        pipe_out=None,
         file_path: str = "output.wav",
         **kwargs,
     ):
@@ -377,6 +381,8 @@
                 Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
             speed (float, optional):
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
+            pipe_out (BytesIO, optional):
+                Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping. Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
             kwargs (dict, optional):
@@ -386,10 +392,16 @@
 
         if self.csapi is not None:
             return self.tts_coqui_studio(
-                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+                text=text,
+                speaker_name=speaker,
+                language=language,
+                emotion=emotion,
+                speed=speed,
+                file_path=file_path,
+                pipe_out=pipe_out,
             )
         wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
-        self.synthesizer.save_wav(wav=wav, path=file_path)
+        self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
         return file_path
 
     def voice_conversion(
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 5ff1181f4e..78a20c2566 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import argparse
+import contextlib
 import sys
 from argparse import RawTextHelpFormatter
 
@@ -59,6 +60,18 @@
   $ tts --text "Text for TTS" --out_path output/path/speech.wav
   ```
 
+- Run TTS and pipe out the generated TTS wav file data:
+
+  ```
+  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+  ```
+
+- Run TTS and define a speed factor for 🐸Coqui Studio models, between 0.0 and 2.0:
+
+  ```
+  $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav
+  ```
+
 - Run a TTS model with its default vocoder model:
 
   ```
@@ -228,6 +241,20 @@ def main():
         help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
         default=None,
     )
+    parser.add_argument(
+        "--pipe_out",
+        help="Write the generated TTS wav to stdout for shell piping.",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+    )
+    parser.add_argument(
+        "--speed",
+        type=float,
+        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
+        default=None,
+    )
 
     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
@@ -335,167 +362,177 @@ def main():
     if not any(check_args):
         parser.parse_args(["-h"])
 
-    # Late-import to make things load faster
-    from TTS.api import TTS
-    from TTS.utils.manage import ModelManager
-    from TTS.utils.synthesizer import Synthesizer
-
-    # load model manager
-    path = Path(__file__).parent / "../.models.json"
-    manager = ModelManager(path, progress_bar=args.progress_bar)
-    api = TTS()
-
-    tts_path = None
-    tts_config_path = None
-    speakers_file_path = None
-    language_ids_file_path = None
-    vocoder_path = None
-    vocoder_config_path = None
-    encoder_path = None
-    encoder_config_path = None
-    vc_path = None
-    vc_config_path = None
-    model_dir = None
-
-    # CASE1 #list : list pre-trained TTS models
-    if args.list_models:
-        manager.add_cs_api_models(api.list_models())
-        manager.list_models()
-        sys.exit()
-
-    # CASE2 #info : model info for pre-trained TTS models
-    if args.model_info_by_idx:
-        model_query = args.model_info_by_idx
-        manager.model_info_by_idx(model_query)
-        sys.exit()
-
-    if args.model_info_by_name:
-        model_query_full_name = args.model_info_by_name
-        manager.model_info_by_full_name(model_query_full_name)
-        sys.exit()
-
-    # CASE3: TTS with coqui studio models
-    if "coqui_studio" in args.model_name:
-        print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
-        print(" > Saving output to ", args.out_path)
-        return
-
-    # CASE4: load pre-trained model paths
-    if args.model_name is not None and not args.model_path:
-        model_path, config_path, model_item = manager.download_model(args.model_name)
-        # tts model
-        if model_item["model_type"] == "tts_models":
-            tts_path = model_path
-            tts_config_path = config_path
-            if "default_vocoder" in model_item:
-                args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
-
-        # voice conversion model
-        if model_item["model_type"] == "voice_conversion_models":
-            vc_path = model_path
-            vc_config_path = config_path
-
-        # tts model with multiple files to be loaded from the directory path
-        if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
-            model_dir = model_path
-            tts_path = None
-            tts_config_path = None
-            args.vocoder_name = None
-
-    # load vocoder
-    if args.vocoder_name is not None and not args.vocoder_path:
-        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
-
-    # CASE5: set custom model paths
-    if args.model_path is not None:
-        tts_path = args.model_path
-        tts_config_path = args.config_path
-        speakers_file_path = args.speakers_file_path
-        language_ids_file_path = args.language_ids_file_path
-
-    if args.vocoder_path is not None:
-        vocoder_path = args.vocoder_path
-        vocoder_config_path = args.vocoder_config_path
-
-    if args.encoder_path is not None:
-        encoder_path = args.encoder_path
-        encoder_config_path = args.encoder_config_path
-
-    device = args.device
-    if args.use_cuda:
-        device = "cuda"
-
-    # load models
-    synthesizer = Synthesizer(
-        tts_path,
-        tts_config_path,
-        speakers_file_path,
-        language_ids_file_path,
-        vocoder_path,
-        vocoder_config_path,
-        encoder_path,
-        encoder_config_path,
-        vc_path,
-        vc_config_path,
-        model_dir,
-        args.voice_dir,
-    ).to(device)
-
-    # query speaker ids of a multi-speaker model.
-    if args.list_speaker_idxs:
-        print(
-            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
-        )
-        print(synthesizer.tts_model.speaker_manager.name_to_id)
-        return
-
-    # query langauge ids of a multi-lingual model.
-    if args.list_language_idxs:
-        print(
-            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
-        )
-        print(synthesizer.tts_model.language_manager.name_to_id)
-        return
-
-    # check the arguments against a multi-speaker model.
-    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
-        print(
-            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
-            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
-        )
-        return
-
-    # RUN THE SYNTHESIS
-    if args.text:
-        print(" > Text: {}".format(args.text))
-
-    # kick it
-    if tts_path is not None:
-        wav = synthesizer.tts(
-            args.text,
-            speaker_name=args.speaker_idx,
-            language_name=args.language_idx,
-            speaker_wav=args.speaker_wav,
-            reference_wav=args.reference_wav,
-            style_wav=args.capacitron_style_wav,
-            style_text=args.capacitron_style_text,
-            reference_speaker_name=args.reference_speaker_idx,
-        )
-    elif vc_path is not None:
-        wav = synthesizer.voice_conversion(
-            source_wav=args.source_wav,
-            target_wav=args.target_wav,
-        )
-    elif model_dir is not None:
-        wav = synthesizer.tts(
-            args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
-        )
-
-    # save the results
-    print(" > Saving output to {}".format(args.out_path))
-    synthesizer.save_wav(wav, args.out_path)
+    pipe_out = sys.stdout if args.pipe_out else None
+
+    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
+        # Late-import to make things load faster
+        from TTS.api import TTS
+        from TTS.utils.manage import ModelManager
+        from TTS.utils.synthesizer import Synthesizer
+
+        # load model manager
+        path = Path(__file__).parent / "../.models.json"
+        manager = ModelManager(path, progress_bar=args.progress_bar)
+        api = TTS()
+
+        tts_path = None
+        tts_config_path = None
+        speakers_file_path = None
+        language_ids_file_path = None
+        vocoder_path = None
+        vocoder_config_path = None
+        encoder_path = None
+        encoder_config_path = None
+        vc_path = None
+        vc_config_path = None
+        model_dir = None
+
+        # CASE1 #list : list pre-trained TTS models
+        if args.list_models:
+            manager.add_cs_api_models(api.list_models())
+            manager.list_models()
+            sys.exit()
+
+        # CASE2 #info : model info for pre-trained TTS models
+        if args.model_info_by_idx:
+            model_query = args.model_info_by_idx
+            manager.model_info_by_idx(model_query)
+            sys.exit()
+
+        if args.model_info_by_name:
+            model_query_full_name = args.model_info_by_name
+            manager.model_info_by_full_name(model_query_full_name)
+            sys.exit()
+
+        # CASE3: TTS with coqui studio models
+        if "coqui_studio" in args.model_name:
+            print(" > Using 🐸Coqui Studio model: ", args.model_name)
+            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+            api.tts_to_file(
+                text=args.text,
+                emotion=args.emotion,
+                file_path=args.out_path,
+                language=args.language,
+                speed=args.speed,
+                pipe_out=pipe_out,
+            )
+            print(" > Saving output to ", args.out_path)
+            return
+
+        # CASE4: load pre-trained model paths
+        if args.model_name is not None and not args.model_path:
+            model_path, config_path, model_item = manager.download_model(args.model_name)
+            # tts model
+            if model_item["model_type"] == "tts_models":
+                tts_path = model_path
+                tts_config_path = config_path
+                if "default_vocoder" in model_item:
+                    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+
+            # voice conversion model
+            if model_item["model_type"] == "voice_conversion_models":
+                vc_path = model_path
+                vc_config_path = config_path
+
+            # tts model with multiple files to be loaded from the directory path
+            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
+                model_dir = model_path
+                tts_path = None
+                tts_config_path = None
+                args.vocoder_name = None
+
+        # load vocoder
+        if args.vocoder_name is not None and not args.vocoder_path:
+            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+        # CASE5: set custom model paths
+        if args.model_path is not None:
+            tts_path = args.model_path
+            tts_config_path = args.config_path
+            speakers_file_path = args.speakers_file_path
+            language_ids_file_path = args.language_ids_file_path
+
+        if args.vocoder_path is not None:
+            vocoder_path = args.vocoder_path
+            vocoder_config_path = args.vocoder_config_path
+
+        if args.encoder_path is not None:
+            encoder_path = args.encoder_path
+            encoder_config_path = args.encoder_config_path
+
+        device = args.device
+        if args.use_cuda:
+            device = "cuda"
+
+        # load models
+        synthesizer = Synthesizer(
+            tts_path,
+            tts_config_path,
+            speakers_file_path,
+            language_ids_file_path,
+            vocoder_path,
+            vocoder_config_path,
+            encoder_path,
+            encoder_config_path,
+            vc_path,
+            vc_config_path,
+            model_dir,
+            args.voice_dir,
+        ).to(device)
+
+        # query speaker ids of a multi-speaker model.
+        if args.list_speaker_idxs:
+            print(
+                " > Available speaker ids: Set --speaker_idx flag to one of these values to use the multi-speaker model."
+            )
+            print(synthesizer.tts_model.speaker_manager.name_to_id)
+            return
+
+        # query language ids of a multi-lingual model.
+        if args.list_language_idxs:
+            print(
+                " > Available language ids: Set --language_idx flag to one of these values to use the multi-lingual model."
+            )
+            print(synthesizer.tts_model.language_manager.name_to_id)
+            return
+
+        # check the arguments against a multi-speaker model.
+        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+            print(
+                " [!] Looks like you are using a multi-speaker model. Define `--speaker_idx` to "
+                "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
+            )
+            return
+
+        # RUN THE SYNTHESIS
+        if args.text:
+            print(" > Text: {}".format(args.text))
+
+        # kick it
+        if tts_path is not None:
+            wav = synthesizer.tts(
+                args.text,
+                speaker_name=args.speaker_idx,
+                language_name=args.language_idx,
+                speaker_wav=args.speaker_wav,
+                reference_wav=args.reference_wav,
+                style_wav=args.capacitron_style_wav,
+                style_text=args.capacitron_style_text,
+                reference_speaker_name=args.reference_speaker_idx,
+            )
+        elif vc_path is not None:
+            wav = synthesizer.voice_conversion(
+                source_wav=args.source_wav,
+                target_wav=args.target_wav,
+            )
+        elif model_dir is not None:
+            wav = synthesizer.tts(
+                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
+            )
+
+        # save the results
+        print(" > Saving output to {}".format(args.out_path))
+        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
 
 
 if __name__ == "__main__":
diff --git a/TTS/cs_api.py b/TTS/cs_api.py
index a36452abc9..4a44b535fd 100644
--- a/TTS/cs_api.py
+++ b/TTS/cs_api.py
@@ -9,6 +9,8 @@
 import requests
 from scipy.io import wavfile
 
+from TTS.utils.audio.numpy_transforms import save_wav
+
 
 class Speaker(object):
     """Convert dict to object."""
@@ -288,6 +290,7 @@ def tts_to_file(
         speaker_id=None,
         emotion=None,
         speed=1.0,
+        pipe_out=None,
         language=None,
         file_path: str = None,
     ) -> str:
@@ -300,6 +303,7 @@ def tts_to_file(
             speaker_id (str): Speaker ID. If None, the speaker name is used.
             emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
             speed (float): Speed of the speech. 1.0 is normal speed.
+            pipe_out (BytesIO, optional): Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                 supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             file_path (str): Path to save the file. If None, a temporary file is created.
@@ -307,7 +311,7 @@
         if file_path is None:
             file_path = tempfile.mktemp(".wav")
         wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
+        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
         return file_path
 
 
diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py
index ae44472f05..e2b71fb2fe 100644
--- a/TTS/utils/audio/numpy_transforms.py
+++ b/TTS/utils/audio/numpy_transforms.py
@@ -1,3 +1,4 @@
+from io import BytesIO
 from typing import Tuple
 
 import librosa
@@ -427,16 +428,24 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False,
     return x
 
 
-def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None:
+def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out=None, **kwargs) -> None:
     """Save float waveform to a file using Scipy.
 
     Args:
         wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
        path (str): Path to a output file.
        sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
+        pipe_out (BytesIO, optional): Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping. Defaults to None.
     """
     wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-    scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16))
+
+    wav_norm = wav_norm.astype(np.int16)
+    if pipe_out:
+        wav_buffer = BytesIO()
+        scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm)
+        wav_buffer.seek(0)
+        pipe_out.buffer.write(wav_buffer.read())
+    scipy.io.wavfile.write(path, sample_rate, wav_norm)
 
 
 def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray:
diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py
index b0920dc9eb..248e15b888 100644
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@@ -1,3 +1,4 @@
+from io import BytesIO
 from typing import Dict, Tuple
 
 import librosa
@@ -693,20 +694,27 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
             x = self.rms_volume_norm(x, self.db_level)
         return x
 
-    def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
+    def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None:
         """Save a waveform to a file using Scipy.
 
         Args:
             wav (np.ndarray): Waveform to save.
             path (str): Path to a output file.
             sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
+            pipe_out (BytesIO, optional): Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping. Defaults to None.
         """
         if self.do_rms_norm:
             wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
         else:
             wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
+
+        wav_norm = wav_norm.astype(np.int16)
+        if pipe_out:
+            wav_buffer = BytesIO()
+            scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm)
+            wav_buffer.seek(0)
+            pipe_out.buffer.write(wav_buffer.read())
+        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm)
 
     def get_duration(self, filename: str) -> float:
         """Get the duration of a wav file using Librosa.
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2e2e40e2cd..a7370cd2c9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -235,19 +235,20 @@ def split_into_sentences(self, text) -> List[str]:
         """
         return self.seg.segment(text)
 
-    def save_wav(self, wav: List[int], path: str) -> None:
+    def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None:
         """Save the waveform as a file.
 
         Args:
             wav (List[int]): waveform as a list of values.
             path (str): output path to save the waveform.
+            pipe_out (BytesIO, optional): Stream to write the generated wav bytes to (e.g. `sys.stdout`) for shell piping. Defaults to None.
         """
         # if tensor convert to numpy
         if torch.is_tensor(wav):
             wav = wav.cpu().numpy()
         if isinstance(wav, list):
             wav = np.array(wav)
-        save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate)
+        save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out)
 
     def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
         output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py
index a96c8beab6..084f81d489 100644
--- a/tests/api_tests/test_synthesize_api.py
+++ b/tests/api_tests/test_synthesize_api.py
@@ -13,3 +13,16 @@ def test_synthesize():
         '--text "This is it" '
         f'--out_path "{output_path}"'
     )
+
+    # 🐸 Coqui studio model with speed arg.
+    run_cli(
+        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
+        '--text "This is it but slow" --speed 0.1 '
+        f'--out_path "{output_path}"'
+    )
+
+    # test pipe_out command
+    run_cli(
+        'tts --text "test." --pipe_out '
+        f'--out_path "{output_path}" | aplay'
+    )
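As a reading aid, here is a minimal, self-contained sketch of the `pipe_out` behavior this patch threads through the `save_wav` helpers. The function name `demo_save_wav` and the sine-tone demo are hypothetical and not part of the patch; the write logic mirrors what the diff adds to `TTS/utils/audio/numpy_transforms.py`:

```python
import sys
from io import BytesIO

import numpy as np
import scipy.io.wavfile


def demo_save_wav(wav: np.ndarray, path: str, sample_rate: int, pipe_out=None) -> None:
    # Normalize the float waveform in [-1, 1] to int16, as the patched save_wav does.
    wav_norm = (wav * (32767 / max(0.01, np.max(np.abs(wav))))).astype(np.int16)
    if pipe_out:
        # Serialize the wav container into memory, then dump the raw bytes to the
        # pipe. pipe_out is expected to be sys.stdout, hence the .buffer binary stream.
        wav_buffer = BytesIO()
        scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm)
        wav_buffer.seek(0)
        pipe_out.buffer.write(wav_buffer.read())
    # The file on disk is written in either case.
    scipy.io.wavfile.write(path, sample_rate, wav_norm)


if __name__ == "__main__":
    # One second of a 440 Hz sine at 22050 Hz; `python demo.py | aplay` should play it.
    t = np.arange(22050) / 22050
    demo_save_wav(np.sin(2 * np.pi * 440 * t), "tone.wav", 22050, pipe_out=sys.stdout)
```

Note the complementary design choice in `TTS/bin/synthesize.py`: when `--pipe_out` is set, `main()` runs under `contextlib.redirect_stdout(None)`, which turns every `print()` into a no-op, so progress messages cannot corrupt the wav byte stream being piped through stdout.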