Xtts demo #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

suminhthanh wants to merge 40 commits into dev from xtts_demo

TTS/demos/xtts_ft_demo/requirements.txt

Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		faster_whisper==1.0.2
		gradio==4.7.1

TTS/demos/xtts_ft_demo/utils/formatter.py

-Original file line number
+Diff line change
@@ -0,0 +1,160 @@
+    import os
+    import gc
+    import torchaudio
+    import pandas
+    from faster_whisper import WhisperModel
+    from glob import glob
+    from tqdm import tqdm
+    import torch
+    import torchaudio
+    # torch.set_num_threads(1)
+    from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
+    torch.set_num_threads(16)
+    import os
+    audio_types = (".wav", ".mp3", ".flac")
+    def list_audios(basePath, contains=None):
+        # return the set of files that are valid
+        return list_files(basePath, validExts=audio_types, contains=contains)
+    def list_files(basePath, validExts=None, contains=None):
+        # loop over the directory structure
+        for (rootDir, dirNames, filenames) in os.walk(basePath):
+            # loop over the filenames in the current directory
+            for filename in filenames:
+                # if the contains string is not none and the filename does not contain
+                # the supplied string, then ignore the file
+                if contains is not None and filename.find(contains) == -1:
+                    continue
+                # determine the file extension of the current file
+                ext = filename[filename.rfind("."):].lower()
+                # check to see if the file is an audio and should be processed
+                if validExts is None or ext.endswith(validExts):
+                    # construct the path to the audio and yield it
+                    audioPath = os.path.join(rootDir, filename)
+                    yield audioPath
+    def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+        audio_total_size = 0
+        # make sure that ooutput file exists
+        os.makedirs(out_path, exist_ok=True)
+        # Loading Whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print("Loading Whisper Model!")
+        asr_model = WhisperModel("large-v3", device=device, compute_type="float16")
+        metadata = {"audio_file": [], "text": [], "speaker_name": []}
+        if gradio_progress is not None:
+            tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
+        else:
+            tqdm_object = tqdm(audio_files)
+        for audio_path in tqdm_object:
+            wav, sr = torchaudio.load(audio_path)
+            # stereo to mono if needed
+            if wav.size(0) != 1:
+                wav = torch.mean(wav, dim=0, keepdim=True)
+            wav = wav.squeeze()
+            audio_total_size += (wav.size(-1) / sr)
+            segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
+            segments = list(segments)
+            i = 0
+            sentence = ""
+            sentence_start = None
+            first_word = True
+            # added all segments words in a unique list
+            words_list = []
+            for _, segment in enumerate(segments):
+                words = list(segment.words)
+                words_list.extend(words)
+            # process each word
+            for word_idx, word in enumerate(words_list):
+                if first_word:
+                    sentence_start = word.start
+                    # If it is the first sentence, add buffer or get the begining of the file
+                    if word_idx == 0:
+                        sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
+                    else:
+                        # get previous sentence end
+                        previous_word_end = words_list[word_idx - 1].end
+                        # add buffer or get the silence midle between the previous sentence and the current one
+                        sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
+                    sentence = word.word
+                    first_word = False
+                else:
+                    sentence += word.word
+                if word.word[-1] in ["!", ".", "?"]:
+                    sentence = sentence[1:]
+                    # Expand number and abbreviations plus normalization
+                    sentence = multilingual_cleaners(sentence, target_language)
+                    audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
+                    audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
+                    # Check for the next word's existence
+                    if word_idx + 1 < len(words_list):
+                        next_word_start = words_list[word_idx + 1].start
+                    else:
+                        # If don't have more words it means that it is the last sentence then use the audio len as next word start
+                        next_word_start = (wav.shape[0] - 1) / sr
+                    # Average the current word end and next word start
+                    word_end = min((word.end + next_word_start) / 2, word.end + buffer)
+                    absoulte_path = os.path.join(out_path, audio_file)
+                    os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
+                    i += 1
+                    first_word = True
+                    audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
+                    # if the audio is too short ignore it (i.e < 0.33 seconds)
+                    if audio.size(-1) >= sr/3:
+                        torchaudio.save(absoulte_path,
+                            audio,
+                            sr
+                        )
+                    else:
+                        continue
+                    metadata["audio_file"].append(audio_file)
+                    metadata["text"].append(sentence)
+                    metadata["speaker_name"].append(speaker_name)
+        df = pandas.DataFrame(metadata)
+        df = df.sample(frac=1)
+        num_val_samples = int(len(df)*eval_percentage)
+        df_eval = df[:num_val_samples]
+        df_train = df[num_val_samples:]
+        df_train = df_train.sort_values('audio_file')
+        train_metadata_path = os.path.join(out_path, "metadata_train.csv")
+        df_train.to_csv(train_metadata_path, sep="|", index=False)
+        eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
+        df_eval = df_eval.sort_values('audio_file')
+        df_eval.to_csv(eval_metadata_path, sep="|", index=False)
+        # deallocate VRAM and RAM
+        del asr_model, df_train, df_eval, df, metadata
+        gc.collect()
+        return train_metadata_path, eval_metadata_path, audio_total_size

TTS/demos/xtts_ft_demo/utils/gpt_train.py

-Original file line number
+Diff line change
@@ -0,0 +1,172 @@
+    import os
+    import gc
+    from trainer import Trainer, TrainerArgs
+    from TTS.config.shared_configs import BaseDatasetConfig
+    from TTS.tts.datasets import load_tts_samples
+    from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+    from TTS.utils.manage import ModelManager
+    def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
+        #  Logging parameters
+        RUN_NAME = "GPT_XTTS_FT"
+        PROJECT_NAME = "XTTS_trainer"
+        DASHBOARD_LOGGER = "tensorboard"
+        LOGGER_URI = None
+        # Set here the path that the checkpoints will be saved. Default: ./run/training/
+        OUT_PATH = os.path.join(output_path, "run", "training")
+        # Training Parameters
+        OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
+        START_WITH_EVAL = False  # if True it will star with evaluation
+        BATCH_SIZE = batch_size  # set here the batch size
+        GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps
+        # Define here the dataset that you want to use for the fine-tuning on.
+        config_dataset = BaseDatasetConfig(
+            formatter="coqui",
+            dataset_name="ft_dataset",
+            path=os.path.dirname(train_csv),
+            meta_file_train=train_csv,
+            meta_file_val=eval_csv,
+            language=language,
+        )
+        # Add here the configs of the datasets
+        DATASETS_CONFIG_LIST = [config_dataset]
+        # Define the path where XTTS v2.0.1 files will be downloaded
+        CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+        os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+        # DVAE files
+        DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+        MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+        # Set the path to the downloaded files
+        DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+        MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+        # download DVAE files if needed
+        if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+            print(" > Downloading DVAE files!")
+            ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+        # Download XTTS v2.0 checkpoint if needed
+        TOKENIZER_FILE_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/vocab.json"
+        XTTS_CHECKPOINT_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/model.pth"
+        XTTS_CONFIG_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/config.json"
+        # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
+        TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+        XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+        XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file
+        # download XTTS v2.0 files if needed
+        if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+            print(" > Downloading XTTS v2.0 files!")
+            ModelManager._download_model_files(
+                [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+            )
+        # init args and config
+        model_args = GPTArgs(
+            max_conditioning_length=132300,  # 6 secs
+            min_conditioning_length=66150,  # 3 secs
+            debug_loading_failures=False,
+            max_wav_length=max_audio_length,  # ~11.6 seconds
+            max_text_length=200,
+            mel_norm_file=MEL_NORM_FILE,
+            dvae_checkpoint=DVAE_CHECKPOINT,
+            xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+            tokenizer_file=TOKENIZER_FILE,
+            gpt_num_audio_tokens=1026,
+            gpt_start_audio_token=1024,
+            gpt_stop_audio_token=1025,
+            gpt_use_masking_gt_prompt_approach=True,
+            gpt_use_perceiver_resampler=True,
+        )
+        # define audio config
+        audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+        # training parameters config
+        config = GPTTrainerConfig(
+            epochs=num_epochs,
+            output_path=OUT_PATH,
+            model_args=model_args,
+            run_name=RUN_NAME,
+            project_name=PROJECT_NAME,
+            run_description="""
+                GPT XTTS training
+                """,
+            dashboard_logger=DASHBOARD_LOGGER,
+            logger_uri=LOGGER_URI,
+            audio=audio_config,
+            batch_size=BATCH_SIZE,
+            batch_group_size=48,
+            eval_batch_size=BATCH_SIZE,
+            num_loader_workers=8,
+            eval_split_max_size=256,
+            print_step=50,
+            plot_step=100,
+            log_model_step=100,
+            save_step=1000,
+            save_n_checkpoints=1,
+            save_checkpoints=True,
+            # target_loss="loss",
+            print_eval=False,
+            # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+            optimizer="AdamW",
+            optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+            optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+            lr=5e-06,  # learning rate
+            lr_scheduler="MultiStepLR",
+            # it was adjusted accordly for the new step scheme
+            lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+            test_sentences=[],
+        )
+        # init the model from config
+        model = GPTTrainer.init_from_config(config)
+        # load training samples
+        train_samples, eval_samples = load_tts_samples(
+            DATASETS_CONFIG_LIST,
+            eval_split=True,
+            eval_split_max_size=config.eval_split_max_size,
+            eval_split_size=config.eval_split_size,
+        )
+        # init the trainer and 🚀
+        trainer = Trainer(
+            TrainerArgs(
+                restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
+                skip_train_epoch=False,
+                start_with_eval=START_WITH_EVAL,
+                grad_accum_steps=GRAD_ACUMM_STEPS,
+            ),
+            config,
+            output_path=OUT_PATH,
+            model=model,
+            train_samples=train_samples,
+            eval_samples=eval_samples,
+        )
+        trainer.fit()
+        # get the longest text audio file to use as speaker reference
+        samples_len = [len(item["text"].split(" ")) for item in train_samples]
+        longest_text_idx =  samples_len.index(max(samples_len))
+        speaker_ref = train_samples[longest_text_idx]["audio_file"]
+        trainer_out_path = trainer.output_path
+        # deallocate VRAM and RAM
+        del model, trainer, train_samples, eval_samples
+        gc.collect()
+        return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Xtts demo #1

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Xtts demo #1

Are you sure you want to change the base?

Uh oh!

Xtts demo #1

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!