Skip to content

Commit

Permalink
Multi audio sample TTS generation w/XTTS
Browse files Browse the repository at this point in the history
🟧 Using Single Voice Samples
Voice samples are stored in `/alltalk_tts/voices/` and should be named using the following format: `name.wav`. These files will be listed as `name.wav` in the available voices list.

🟧 Using Multiple Voice Samples
If you have multiple voice samples for a single voice, you can organize them into subfolders within the `/alltalk_tts/voices/` directory. Each subfolder should be named according to the voice it contains. Up to 5 voice samples are used per subfolder; if a subfolder contains more than 5 `.wav` files, 5 of them are selected at random.

     • Each subfolder should reflect the name or type of the voice it contains (e.g., female_voice, male_voice).
     • The voice samples inside each subfolder should follow the standard .wav format.
     • An example folder path would be `/alltalk_tts/voices/mynewvoice/` and this would be listed in the available voices list as `mynewvoice/`.

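This organization allows for easy selection and management of multiple voice samples while ensuring the system can correctly identify and utilize each voice. Manual curl API requests must send the folder name with a trailing slash, in the format `mynewvoice/`; the trailing slash is what marks the voice as a folder of samples rather than a single `.wav` file.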
  • Loading branch information
erew123 authored Aug 30, 2024
1 parent 5cb09eb commit b7aa3a7
Show file tree
Hide file tree
Showing 2 changed files with 391 additions and 353 deletions.
41 changes: 35 additions & 6 deletions system/tts_engines/xtts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import torchaudio
import wave
import io
import random
import numpy as np
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
Expand Down Expand Up @@ -287,7 +288,19 @@ def voices_file_list(self):
# ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑

directory = self.main_dir / "voices"
voices = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")]
# Step 1: Add .wav files in the main "voices" directory to the list
voices.extend([f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")])

# Step 2: Walk through subfolders and add subfolder names if they contain .wav files
for root, dirs, files in os.walk(directory):
# Skip the root directory itself and only consider subfolders
if os.path.normpath(root) != os.path.normpath(directory):
if any(f.endswith(".wav") for f in files):
folder_name = os.path.basename(root) + "/"
voices.append(folder_name)

# Remove "voices/" from the list if it somehow got added
voices = [v for v in voices if v != "voices/"]

# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
# ↓↓↓ Keep everything below this line ↓↓↓
Expand Down Expand Up @@ -470,15 +483,31 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena

# XTTSv2 LOCAL & Xttsv2 FT Method
print(f"[{self.branding}Debug] Deciding if streaming or not") if self.debug_tts else None
print(f"[{self.branding}Debug] self.current_model_loaded", self.current_model_loaded) if self.debug_tts else None
print(f"[{self.branding}Debug] self.current_model_loaded is:", self.current_model_loaded) if self.debug_tts else None
self.current_model_loaded
if os.path.isdir(voice):
wavs_files = glob.glob(os.path.join(voice, "*.wav"))
print(f"[{self.branding}Debug] Audio Sample Detection") if self.debug_tts else None
print(f"[{self.branding}Debug] Voice name sent in request is:", voice) if self.debug_tts else None
# Check if the voice ends with a slash, indicating it's a directory
if voice.endswith("/") or voice.endswith("\\"):
# Remove the trailing slash for proper path detection
voice = voice.rstrip("/\\")
if os.path.isdir(os.path.join(self.main_dir, "voices", voice)):
# Normalize the path for the directory and then search for .wav files
normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
wavs_files = glob.glob(os.path.join(normalized_path, "*.wav"))
print(f"[{self.branding}Debug] Directory of multiple voice samples detected. Using multiple WAV files:", wavs_files) if self.debug_tts else None
# If there are more than 5 .wav files, randomly select 5
if len(wavs_files) > 5:
wavs_files = random.sample(wavs_files, 5)
print(f"[{self.branding}Debug] More than 5 wav files detected so only using 5x random audio samples:", wavs_files) if self.debug_tts else None
else:
wavs_files = [f"{self.main_dir}/voices/{voice}"]
# Normalize the path for the file
normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
wavs_files = [normalized_path]
print(f"[{self.branding}Debug] Single voice sample detected. Using one WAV sample:", wavs_files) if self.debug_tts else None

if self.current_model_loaded.startswith ("xtts"):
print(f"[{self.branding}Debug] Text arriving at engine {text}") if self.debug_tts else None
print(f"[{self.branding}Debug] Text arriving at TTS engine is: {text}") if self.debug_tts else None
gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
audio_path=wavs_files,
gpt_cond_len=self.model.config.gpt_cond_len,
Expand Down
Loading

0 comments on commit b7aa3a7

Please sign in to comment.