Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to use language subfolders in xtts voicesets #481

Open
wants to merge 18 commits into
base: alltalkbeta
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3309,6 +3309,7 @@ def on_load(request: gr.Request):
gen_lang = gr.Dropdown(
value=config.api_def.api_language,
choices=[
"auto",
"ar",
"zh",
"cs",
Expand Down
3 changes: 3 additions & 0 deletions system/openaittstest.html
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,13 @@ <h1>OpenAI API/AllTalk TTS API Test</h1>
<label for="voice">Voice:</label>
<select id="voice" name="voice" required>
<option value="alloy">Alloy</option>
<option value="ash">Ash</option>
<option value="coral">Coral</option>
<option value="echo">Echo</option>
<option value="fable">Fable</option>
<option value="nova">Nova</option>
<option value="onyx">Onyx</option>
<option value="sage">Sage</option>
<option value="shimmer">Shimmer</option>
</select>
</div>
Expand Down
1 change: 1 addition & 0 deletions system/requirements/requirements_colab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_standalone.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ fastapi==0.112.2
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_textgen.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
14 changes: 12 additions & 2 deletions system/tts_engines/f5tts/f5tts_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,21 +140,25 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
# Update the settings and openai_voices dictionaries with the new values
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
model_config_data["openai_voices"]["ash"] = ash_gr
model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
Expand Down Expand Up @@ -192,6 +196,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
Expand All @@ -202,12 +207,17 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
with gr.Row():
coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
Expand All @@ -226,7 +236,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)

###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
Expand Down
10 changes: 9 additions & 1 deletion system/tts_engines/f5tts/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down Expand Up @@ -267,12 +272,15 @@ class AllTalkHelpContent:

### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- Maps OpenAI's six standard voices to equivalent voices in the current engine:
- Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
- `ash`
- `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
- `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
Expand Down
36 changes: 32 additions & 4 deletions system/tts_engines/f5tts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def __init__(self):
self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not currently used)
self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
Expand All @@ -154,10 +155,13 @@ def __init__(self):
self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (If it supports pitch)
# Gather the OpenAI API Voice Mappings
self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
Expand Down Expand Up @@ -403,12 +407,19 @@ def scan_models_folder(self):
if not model_files:
# If no model_*.safetensors found, try any .safetensors file
model_files = list(model_dir.glob("*.safetensors"))

if not model_files:
# Try finding a pt model file as fallback
# If no model_*.safetensors found, try finding a .pt model file
model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# If no model_*.pt found, try any .pt file
model_files = list(model_dir.glob("*.pt"))

vocab_file = model_dir / "vocab.txt"
vocos_dir = model_dir / "vocos"
vocos_config = vocos_dir / "config.yaml"
vocos_model = vocos_dir / "pytorch_model.bin"

# Check if we have at least one model file and all other required files
if model_files and all(f.exists() for f in [vocab_file, vocos_config, vocos_model]):
model_name = model_dir.name
Expand Down Expand Up @@ -506,11 +517,28 @@ async def api_manual_load_model(self, model_name):
vocab_path = model_dir / "vocab.txt"
vocos_path = model_dir / "vocos"

# Dynamically find the safetensors model file
# Dynamically find the safetensors or pickletensor model file
model_is_pickle = False
model_files = list(model_dir.glob("model_*.safetensors"))
if not model_files:
# Try finding any safetensors file as fallback
model_files = list(model_dir.glob("*.safetensors"))
if not model_files:
# Try finding the pt model file as fallback
model_files = list(model_dir.glob("model_*.pt"))
model_is_pickle = True
if not model_files:
# Try finding any pt file as fallback
model_files = list(model_dir.glob("*.pt"))
model_is_pickle = True

if model_is_pickle:
print(
f"[{self.branding}ENG] \033[91mWarning\033[0m: The models found in '{model_dir}' are in Pickle format (.pt). "
f"This format poses security risks due to potential arbitrary code execution. "
f"Please ensure the source of the models is trusted. We recommend using 'safetensors' format for enhanced security. "
f"For more information, visit: https://huggingface.co/docs/hub/en/security-pickle"
)

if not model_files:
print(f"[{self.branding}ENG] \033[91mError\033[0m: No model file (.safetensors or .pt) was found in the F5-TTS models directory.")
Expand Down Expand Up @@ -1082,7 +1110,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena

generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")

if streaming:
with open(output_file, 'rb') as f:
Expand Down
6 changes: 5 additions & 1 deletion system/tts_engines/f5tts/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
"streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
Expand All @@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "female_01.wav",
"ash": "female_01.wav",
"coral": "female_01.wav",
"echo": "female_01.wav",
"fable": "female_01.wav",
"nova": "female_01.wav",
"onyx": "female_01.wav",
"shimmer": "female_01.wavf"
"sage": "female_01.wav",
"shimmer": "female_01.wav"
}
}
10 changes: 9 additions & 1 deletion system/tts_engines/parler/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down Expand Up @@ -267,12 +272,15 @@ class AllTalkHelpContent:

### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- Maps OpenAI's six standard voices to equivalent voices in the current engine:
- Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
- `ash`
- `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
- `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
Expand Down
Loading