Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to use language subfolders in xtts voicesets #481

Open
wants to merge 18 commits into
base: alltalkbeta
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3309,6 +3309,7 @@ def on_load(request: gr.Request):
gen_lang = gr.Dropdown(
value=config.api_def.api_language,
choices=[
"auto",
"ar",
"zh",
"cs",
Expand Down
3 changes: 3 additions & 0 deletions system/openaittstest.html
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,13 @@ <h1>OpenAI API/AllTalk TTS API Test</h1>
<label for="voice">Voice:</label>
<select id="voice" name="voice" required>
<option value="alloy">Alloy</option>
<option value="ash">Ash</option>
<option value="coral">Coral</option>
<option value="echo">Echo</option>
<option value="fable">Fable</option>
<option value="nova">Nova</option>
<option value="onyx">Onyx</option>
<option value="sage">Sage</option>
<option value="shimmer">Shimmer</option>
</select>
</div>
Expand Down
1 change: 1 addition & 0 deletions system/requirements/requirements_colab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_standalone.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ fastapi==0.112.2
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
1 change: 1 addition & 0 deletions system/requirements/requirements_textgen.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
langdetect>=1.0.9
14 changes: 12 additions & 2 deletions system/tts_engines/f5tts/f5tts_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,21 +140,25 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
# Update the settings and openai_voices dictionaries with the new values
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
model_config_data["openai_voices"]["ash"] = ash_gr
model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
Expand Down Expand Up @@ -192,6 +196,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
Expand All @@ -202,12 +207,17 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
with gr.Row():
coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
Expand All @@ -226,7 +236,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)

###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
Expand Down
10 changes: 9 additions & 1 deletion system/tts_engines/f5tts/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down Expand Up @@ -267,12 +272,15 @@ class AllTalkHelpContent:

### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- Maps OpenAI's six standard voices to equivalent voices in the current engine:
- Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
- `ash`
- `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
- `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
Expand Down
36 changes: 32 additions & 4 deletions system/tts_engines/f5tts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def __init__(self):
self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not currently used)
self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
Expand All @@ -154,10 +155,13 @@ def __init__(self):
self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (If it supports pitch)
# Gather the OpenAI API Voice Mappings
self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
Expand Down Expand Up @@ -403,12 +407,19 @@ def scan_models_folder(self):
if not model_files:
# If no model_*.safetensors found, try any .safetensors file
model_files = list(model_dir.glob("*.safetensors"))

if not model_files:
# Try finding a pt model file as fallback
# If no model_*.safetensors found, try finding a .pt model file
model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# If no model_*.pt found, try any .pt file
model_files = list(model_dir.glob("*.pt"))

vocab_file = model_dir / "vocab.txt"
vocos_dir = model_dir / "vocos"
vocos_config = vocos_dir / "config.yaml"
vocos_model = vocos_dir / "pytorch_model.bin"

# Check if we have at least one model file and all other required files
if model_files and all(f.exists() for f in [vocab_file, vocos_config, vocos_model]):
model_name = model_dir.name
Expand Down Expand Up @@ -506,11 +517,28 @@ async def api_manual_load_model(self, model_name):
vocab_path = model_dir / "vocab.txt"
vocos_path = model_dir / "vocos"

# Dynamically find the safetensors model file
# Dynamically find the safetensors or pickletensor model file
model_is_pickle = False
model_files = list(model_dir.glob("model_*.safetensors"))
if not model_files:
# Try finding any safetensors file as fallback
model_files = list(model_dir.glob("*.safetensors"))
if not model_files:
# Try finding the pt model file as fallback
model_files = list(model_dir.glob("model_*.pt"))
model_is_pickle = True
if not model_files:
# Try finding any pt file as fallback
model_files = list(model_dir.glob("*.pt"))
model_is_pickle = True

if model_is_pickle:
print(
f"[{self.branding}ENG] \033[91mWarning\033[0m: The models found in '{model_dir}' are in Pickle format (.pt). "
f"This format poses security risks due to potential arbitrary code execution. "
f"Please ensure the source of the models is trusted. We recommend using 'safetensors' format for enhanced security. "
f"For more information, visit: https://huggingface.co/docs/hub/en/security-pickle"
)

if not model_files:
print(f"[{self.branding}ENG] \033[91mError\033[0m: No model file (.safetensors or .pt) was found in the F5-TTS models directory.")
Expand Down Expand Up @@ -1082,7 +1110,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena

generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")

if streaming:
with open(output_file, 'rb') as f:
Expand Down
6 changes: 5 additions & 1 deletion system/tts_engines/f5tts/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
"streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
Expand All @@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "female_01.wav",
"ash": "female_01.wav",
"coral": "female_01.wav",
"echo": "female_01.wav",
"fable": "female_01.wav",
"nova": "female_01.wav",
"onyx": "female_01.wav",
"shimmer": "female_01.wavf"
"sage": "female_01.wav",
"shimmer": "female_01.wav"
}
}
10 changes: 9 additions & 1 deletion system/tts_engines/parler/help_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support

- **Stream Response Capability**
- Enables real-time streaming of generated speech output
- Reduces latency for faster feedback during synthesis
- Only available for engines and models that support Streaming

- **Temperature Control**
- Adjusts the variability in speech generation
Expand Down Expand Up @@ -267,12 +272,15 @@ class AllTalkHelpContent:

### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- Maps OpenAI's six standard voices to equivalent voices in the current engine:
- Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
- `ash`
- `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
- `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
Expand Down
Loading