diff --git a/examples/gui_file_to_text_to_audio_playback.py b/examples/gui_file_to_text_to_audio_playback.py
index 8eef91b..258bb83 100644
--- a/examples/gui_file_to_text_to_audio_playback.py
+++ b/examples/gui_file_to_text_to_audio_playback.py
@@ -13,7 +13,7 @@
 
 (3) run pip3 install WhisperSpeech
 
-(4) pip3 install soundfile==0.12.1 sounddevice==0.4.6 pypdf==4.0.2 python-docx==1.1.0 nltk==3.8.1
+(4) pip3 install sounddevice==0.4.6 pypdf==4.0.2 python-docx==1.1.0 nltk==3.8.1
 
 (9) python gui_file_to_text_to_audio_playback.py
 '''
diff --git a/examples/gui_text_to_audio_playback.py b/examples/gui_text_to_audio_playback.py
index 3f2549b..2a44015 100644
--- a/examples/gui_text_to_audio_playback.py
+++ b/examples/gui_text_to_audio_playback.py
@@ -12,7 +12,7 @@
 
 https://pytorch.org/get-started/locally/
 
 (3) pip3 install WhisperSpeech
-(4) pip3 install soundfile==0.12.1 sounddevice==0.4.6
+(4) pip3 install sounddevice==0.4.6
 (5) python gui_text_to_audio_playback.py
 '''
diff --git a/examples/minimal.py b/examples/minimal.py
index 67a1b7c..b9dc99b 100644
--- a/examples/minimal.py
+++ b/examples/minimal.py
@@ -1,6 +1,8 @@
 from whisperspeech.pipeline import Pipeline
 
-tts_pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
+tts_pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')  # leave only the line for the model you want to use uncommented
+# tts_pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-base-en+pl.model')
+# tts_pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-small-en+pl.model')
 
-save_path = 'output.wav'
-tts_pipe.generate_to_file(save_path, "This is a test")
\ No newline at end of file
+save_path = 'output.wav'  # change the file extension to .mp3, .flac, .ogg, etc. to save in a different format
+tts_pipe.generate_to_file(save_path, "This is a test")
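Reviewer's note: the new comment in `minimal.py` says `generate_to_file` picks the output format from the file extension. A quick sketch of how that could be exercised (hypothetical snippet, not part of this PR; it assumes the `Pipeline` API shown above and that the underlying audio writer supports these containers):

```python
from whisperspeech.pipeline import Pipeline

tts_pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')

# The extension of the output path selects the container format, so the
# same call can emit WAV, FLAC, or OGG (each call re-synthesizes the text).
for ext in ('wav', 'flac', 'ogg'):
    tts_pipe.generate_to_file(f'output.{ext}', "This is a test")
```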
diff --git a/examples/readme.md b/examples/readme.md
index df46b5a..f467e23 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -1 +1,33 @@
-This folder contains examples of basic usage of the WhisperSpeech library.
+# Example Scripts
+
+Contributions are welcome! Feel free to create an issue or pull request on GitHub.
+
+### `minimal.py`
+
+- Minimal script that takes hardcoded text input and outputs an audio file.
+
+### `text_to_playback.py`
+
+- Uses the new `generate_to_playback` method to convert hardcoded text directly to audio playback without intermediate steps. Designed for minimal script length, but does not include queue management to reduce latency.
+
+### `text_to_audio_playback.py`
+
+- Processes text one sentence at a time and adds each sentence to a queue for playback. Designed for users who prefer a command-line approach but still want the efficiency of queued playback.
+
+### `gui_file_to_text_to_audio_playback.py`
+
+- Provides a graphical user interface for loading a file. The text is then converted into speech sentence by sentence, using queue management to reduce latency.
+
+### `gui_text_to_audio_playback.py`
+
+- Similar to `gui_file_to_text_to_audio_playback.py`, but the user types the text to be played back instead of loading a file. Text is still processed one sentence at a time for low latency.
+
+
+| Feature | gui_file_to...<br>audio_playback.py | gui_text_to...<br>audio_playback.py | minimal.py | text_to_audio...<br>playback.py | text_to_playback.py |
+|:---------------------------------:|:-----------------------------------:|:-----------------------------------:|:----------:|:-------------------------------:|:-------------------:|
+| **GUI** | ✅ | ✅ | ❌ | ❌ | ❌ |
+| **Input** | File | Text Entry | Predefined Text | Predefined Text | Predefined Text |
+| **Output** | Audio Playback | Audio Playback | WAV File | Audio Playback | Audio Playback |
+| **Queue Management** | ✅ | ✅ | ❌ | ✅ | ❌ |
+| **Text-to-Speech<br>Conversion** | ✅ | ✅ | ✅ | ✅ | ✅ |
+| **Load File** | ✅ | ❌ | ❌ | ❌ | ❌ |
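The queue-managed scripts in the table share one pattern: a producer thread synthesizes one sentence at a time while the main thread plays finished chunks, so synthesis of the next sentence overlaps playback of the current one. A rough sketch of that pattern (illustrative only, not one of the scripts above; it assumes `generate` returns 24 kHz mono audio and that the NLTK `punkt` tokenizer data has been downloaded):

```python
import queue
import threading

import sounddevice as sd
from nltk.tokenize import sent_tokenize
from whisperspeech.pipeline import Pipeline

pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
audio_queue = queue.Queue()

def producer(text):
    # Synthesize sentence by sentence so playback can start early.
    for sentence in sent_tokenize(text):
        audio = pipe.generate(sentence)
        audio_queue.put(audio.cpu().numpy().flatten())
    audio_queue.put(None)  # sentinel: nothing left to play

def consumer():
    while True:
        chunk = audio_queue.get()
        if chunk is None:
            break
        sd.play(chunk, samplerate=24000)  # WhisperSpeech outputs 24 kHz audio
        sd.wait()

threading.Thread(target=producer,
                 args=("This is the first sentence. This is the second.",),
                 daemon=True).start()
consumer()
```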
diff --git a/examples/text_to_audio_file.py b/examples/text_to_audio_file.py
deleted file mode 100644
index b0f624e..0000000
--- a/examples/text_to_audio_file.py
+++ /dev/null
@@ -1,50 +0,0 @@
-'''
-Simple example that speaks some text and creates an audio file named output_audio.wav in the same directory in which the script is run.
-
-INSTALLATION INSTRUCTIONS~
-**Tested on Windows
-
-(1) create a virtual environment and activate it
-(2) install pytorch by going to the following website and running the appropriate command for your platform and setup:
-
-https://pytorch.org/get-started/locally/
-
-(3) pip3 install WhisperSpeech
-(4) pip3 install soundfile==0.12.1 pydub==0.25.1
-(5) python text_to_audio_file.py
-'''
-
-from pydub import AudioSegment
-import numpy as np
-from whisperspeech.pipeline import Pipeline
-
-# pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-small-en+pl.model')
-# pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
-pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-base-en+pl.model')
-
-audio_tensor = pipe.generate("""
-    This is some sample text. You would add text here that you want spoken and then only leave one of the above lines uncommented for the model you want to test. Note that this script does not rely on the standard method within the whisperspeech pipeline. Rather, it replaces a part of the functionality with reliance on pydub instead. This approach "just worked." Feel free to modify or distribute at your pleasure.
-""")
-
-# generate uses CUDA if available; therefore, it's necessary to move to CPU before converting to NumPy array
-audio_np = (audio_tensor.cpu().numpy() * 32767).astype(np.int16)
-
-if len(audio_np.shape) == 1:
-    audio_np = np.expand_dims(audio_np, axis=0)
-else:
-    audio_np = audio_np.T
-
-print("Array shape:", audio_np.shape)
-print("Array dtype:", audio_np.dtype)
-
-try:
-    audio_segment = AudioSegment(
-        audio_np.tobytes(),
-        frame_rate=24000,
-        sample_width=2,
-        channels=1
-    )
-    audio_segment.export('output_audio.wav', format='wav')
-    print("Audio file generated: output_audio.wav")
-except Exception as e:
-    print(f"Error writing audio file: {e}")
\ No newline at end of file
diff --git a/examples/text_to_audio_playback.py b/examples/text_to_audio_playback.py
index 35a4c79..de58024 100644
--- a/examples/text_to_audio_playback.py
+++ b/examples/text_to_audio_playback.py
@@ -11,7 +11,7 @@
 
 https://pytorch.org/get-started/locally/
 
 (3) pip3 install WhisperSpeech
-(4) pip3 install soundfile==0.12.1 sounddevice==0.4.6
+(4) pip3 install sounddevice==0.4.6
 (5) python gui_text_to_audio_playback.py
 '''
diff --git a/examples/text_to_playback.py b/examples/text_to_playback.py
new file mode 100644
index 0000000..563ebdd
--- /dev/null
+++ b/examples/text_to_playback.py
@@ -0,0 +1,33 @@
+'''
+DESCRIPTION~
+
+Processes a body of text directly into audio playback using the sounddevice library.
+
+PLEASE NOTE~
+
+If you need more granular control, such as processing sentences in one thread (one sentence at a time) while simultaneously playing them in another thread (reducing latency), consult the "text_to_audio_playback.py" example. It uses the "generate" method in conjunction with the "sounddevice" library directly.
+
+This example uses the "generate_to_playback" method instead, which is good for reducing the length of your script, especially with shorter passages where latency is not as important.
+
+INSTALLATION INSTRUCTIONS~
+
+(1) create a virtual environment and activate it
+(2) install pytorch by going to the following website and running the appropriate command for your platform and setup:
+
+https://pytorch.org/get-started/locally/
+---This script has been tested up to Torch 2.2.0.
+
+(3) pip3 install WhisperSpeech
+(4) pip3 install sounddevice==0.4.6
+(5) python text_to_playback.py
+'''
+
+from whisperspeech.pipeline import Pipeline
+
+# pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-small-en+pl.model')
+# pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
+pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-base-en+pl.model')
+
+pipe.generate_to_playback("""
+    This is some sample text. Add the text you want spoken here, and leave only the line for the model you want to test uncommented above. This text is being used to test the new generate_to_playback method within the pipeline script. It requires sounddevice as a dependency, since that is what performs the playback.
+""")
\ No newline at end of file
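For comparison with the `generate_to_playback` call above, here is a sketch of the more granular route the PLEASE NOTE section mentions: calling `generate` yourself and handing the result to `sounddevice`. (This assumes `generate` returns a mono tensor, possibly on the GPU; the `.cpu()` move and the 24 kHz rate follow the conventions used in the deleted `text_to_audio_file.py` above.)

```python
import sounddevice as sd
from whisperspeech.pipeline import Pipeline

pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-base-en+pl.model')

# generate() may run on CUDA, so move the tensor to the CPU
# before converting it to a NumPy array for playback.
audio = pipe.generate("This is a test.").cpu().numpy().flatten()
sd.play(audio, samplerate=24000)  # WhisperSpeech outputs 24 kHz audio
sd.wait()                         # block until playback finishes
```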