From d44d721d6920297a0d3d06bd27c3a372613a80bb Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Wed, 31 Jul 2024 18:30:47 -0700 Subject: [PATCH 1/5] Tokenizer changes --- finetune.py | 32 ++++-- start_alltalk.sh | 4 + start_diagnostics.sh | 4 + start_environment.sh | 16 +++ start_finetune.sh | 5 + system/ft_tokenizer/compare_and_merge.py | 48 +++++++++ system/ft_tokenizer/expand_xtts.py | 126 +++++++++++++++++++++++ 7 files changed, 227 insertions(+), 8 deletions(-) create mode 100755 start_alltalk.sh create mode 100755 start_diagnostics.sh create mode 100755 start_environment.sh create mode 100755 start_finetune.sh create mode 100644 system/ft_tokenizer/compare_and_merge.py create mode 100644 system/ft_tokenizer/expand_xtts.py diff --git a/finetune.py b/finetune.py index f4b11f8b..073bf71f 100644 --- a/finetune.py +++ b/finetune.py @@ -2376,12 +2376,28 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=7052, - server_name="127.0.0.1", - ) +import random +def find_available_port(start_port, end_port): + ports = list(range(start_port, end_port + 1)) + random.shuffle(ports) + return ports + +ports_to_try = find_available_port(7800, 7810) + +for port in ports_to_try: + try: + demo.queue().launch( + show_api=False, + inbrowser=True, + share=False, + debug=False, + server_port=port, + server_name="127.0.0.1", + ) + print(f"Successfully launched on port {port}") + break + except OSError: + print(f"Port {port} is not available, trying next...") +else: + print("No available ports in the specified range.") diff --git a/start_alltalk.sh b/start_alltalk.sh new file mode 100755 index 00000000..55e0756c --- /dev/null +++ b/start_alltalk.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python script.py diff --git a/start_diagnostics.sh b/start_diagnostics.sh new file mode 100755 index 00000000..c5586879 --- /dev/null +++ b/start_diagnostics.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python diagnostics.py diff --git a/start_environment.sh b/start_environment.sh new file mode 100755 index 00000000..9791ff36 --- /dev/null +++ b/start_environment.sh @@ -0,0 +1,16 @@ +#!/bin/bash +cd "." +if [[ "/home/eleven/alltalkbeta/alltalk_tts" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. 
&& exit; fi +# deactivate existing conda envs as needed to avoid conflicts +{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null +# config +CONDA_ROOT_PREFIX="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda" +INSTALL_ENV_DIR="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME +export CUDA_PATH="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +export CUDA_HOME="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +# activate env +bash --init-file <(echo "source \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh\" && conda activate \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env\"") diff --git a/start_finetune.sh b/start_finetune.sh new file mode 100755 index 00000000..c09d800b --- /dev/null +++ b/start_finetune.sh @@ -0,0 +1,5 @@ +#!/bin/bash +export TRAINER_TELEMETRY=0 +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python finetune.py diff --git a/system/ft_tokenizer/compare_and_merge.py b/system/ft_tokenizer/compare_and_merge.py new file mode 100644 index 00000000..dd1c4534 --- /dev/null +++ b/system/ft_tokenizer/compare_and_merge.py @@ -0,0 +1,48 @@ +import json +import os + +# Define file paths +base_path = "/alltalk_tts/system/ft_tokenizer/" +original_file = os.path.join(base_path, "/alltalk_tts/models/xtts/xttsv2_2.0.3/vocab.json") # base model vocab.json +new_file = os.path.join(base_path, "/expanded_model/bpe_tokenizer-vocab.json") # new bpe tokenizer vocab.json +output_file = os.path.join(base_path, "/expanded_model/expanded_vocab.json") # output expanded_vocab.json file/path + + +with open(original_file, 'r') as f: + original_data = json.load(f) + +with open(new_file, 'r') as f: + new_data = json.load(f) + +# Get the original and new vocabularies +original_vocab = original_data['model']['vocab'] +new_vocab = new_data['model']['vocab'] + +# Find the maximum value in the original vocabulary. We do this so we make sure to append and not overwrite any new values +max_value = max(original_vocab.values()) + +# merge +for key, value in new_vocab.items(): + if key not in original_vocab: + max_value += 1 + original_vocab[key] = max_value + +# Update +original_data['model']['vocab'] = original_vocab + +# Get the original and new merges +original_merges = original_data['model']['merges'] +new_merges = new_data['model']['merges'] + +# Merge the merges +merged_merges = original_merges.copy() +for merge in new_merges: + if merge not in merged_merges: + merged_merges.append(merge) + +# Update +original_data['model']['merges'] = merged_merges +with open(output_file, 'w') as f: + json.dump(original_data, f, ensure_ascii=False, indent=2) + +print(f"Merged vocabulary and merges saved to {output_file}") diff --git a/system/ft_tokenizer/expand_xtts.py b/system/ft_tokenizer/expand_xtts.py new file mode 100644 index 00000000..2c79f0c0 --- /dev/null +++ b/system/ft_tokenizer/expand_xtts.py @@ -0,0 +1,126 @@ +""" +This script does not have any specific alltalk integration yet. So we are manually entering paths and it works as a standalone. + +The script does the following: + - Expands the embedding layer of the base XTTSv2 model according to the user created/trained bpe_tokenizer-vocab.json and the base model vocab.json. 
+Set variable paths with the base config and base model.pth. +Set the new tokenizer/vocab bpe_tokenizer-vocab.json location. +The new model will be saved at \expanded_models\expanded_model.pth + +Once this is done the new expanded model must be swapped in for the base model.pth then combined with the dvae.pth, vocab.json(bpe_tokenizer-vocab.json), base model config.json, +base model speaker_xtts.pth and base model vocoder.json. + + +I left some print debug statements in the script, they may be nice for the user to see during the process. + +""" + +import torch +import torch.nn as nn +import json +from TTS.tts.models.xtts import Xtts +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig +from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer + +config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json" # Path to the base model config.json +pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth" # Path to the base model.pth +new_tokenizer_path = "/expanded_model/expanded_vocab.json" # Path to the new combined expanded_vocab.json +expanded_model_path = "/expanded_model/expanded_model.pth" # Path to where you want the new expanded_model.pth + + +# Open and load the configuration file +with open(config_path, "r") as f: + config_dict = json.load(f) + +# Create a GPTTrainerConfig object and populate it with the loaded configuration +config = GPTTrainerConfig() +config.from_dict(data=config_dict) + +# Function to get the vocabulary size from a tokenizer file +def get_vocab_size(tokenizer_path): + tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path) + return len(tokenizer.tokenizer.get_vocab()) + +# Function to adjust the pretrained model with a new tokenizer +def adjust_pretrained_model( + pretrained_model_path, adjusted_model_path, new_tokenizer_path +): + # Load the pretrained model state dictionary + state_dict = torch.load(pretrained_model_path) + pretrained_state_dict = state_dict["model"] + + # Create a new Xtts model instance with the loaded configuration + model = Xtts(config) + + # Load the pretrained state dictionary into the new model + missing_keys, unexpected_keys = model.load_state_dict( + pretrained_state_dict, strict=False + ) + # Print any missing or unexpected keys for debugging + if missing_keys: + print(f"Missing keys: {missing_keys}") + if unexpected_keys: + print(f"Unexpected keys: {unexpected_keys}") + print("Pretrained model loaded successfully.") + + # Create a new tokenizer with the new vocabulary + new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path) + + # Get the old and new vocabulary sizes, and the embedding dimension + old_vocab_size = model.gpt.text_embedding.num_embeddings + new_vocab_size = len(new_tokenizer.tokenizer.get_vocab()) + embedding_dim = model.gpt.text_embedding.embedding_dim + + # Print vocabulary sizes and embedding dimension for debugging + print(f"Old vocab size: {old_vocab_size}") + print(f"New vocab size: {new_vocab_size}") + print(f"Embedding dimension: {embedding_dim}") + + # Adjust the embedding layer with the new vocabulary size + adjust_embedding_layer( + model, pretrained_state_dict, new_vocab_size, adjusted_model_path + ) + +# Function to adjust the embedding layer for the new vocabulary size +def adjust_embedding_layer( + model, pretrained_state_dict, new_vocab_size, adjusted_model_path +): + old_vocab_size = model.gpt.text_embedding.num_embeddings + embedding_dim = model.gpt.text_embedding.embedding_dim + + # Create new embedding and linear layers with the new vocabulary size + 
new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim) + new_text_head = nn.Linear(embedding_dim, new_vocab_size) + + # Copy weights from the old embedding layer to the new one + if new_vocab_size > old_vocab_size: + # If the new vocabulary is larger, copy existing weights and initialize new ones + new_text_embedding.weight.data[:old_vocab_size] = ( + model.gpt.text_embedding.weight.data + ) + new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data + new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data + + # Initialize new weights with normal distribution + new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + else: + # If the new vocabulary is smaller, truncate the existing weights + new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[ + :new_vocab_size + ] + new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size] + new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size] + + # Replace the old embedding layer with the new one + model.gpt.text_embedding = new_text_embedding + model.gpt.text_head = new_text_head + + # Save the adjusted model + checkpoint = {"model": model.state_dict()} + torch.save(checkpoint, adjusted_model_path) + print(f"Adjusted model saved to {adjusted_model_path}") + +# Expand the pretrained model with the new tokenizer +adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path) From 7e2d3455d2234c6807afdf627124de8cfbf6ee2e Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 1 Aug 2024 00:05:42 -0700 Subject: [PATCH 2/5] removed unecessary scripts and finetune.py edit --- finetune.py | 26 -------------------------- start_alltalk.sh | 4 ---- start_diagnostics.sh | 4 ---- start_environment.sh | 16 ---------------- start_finetune.sh | 5 ----- 5 files changed, 55 deletions(-) delete mode 100755 start_alltalk.sh delete mode 100755 start_diagnostics.sh delete mode 100755 start_environment.sh delete mode 100755 start_finetune.sh diff --git a/finetune.py b/finetune.py index 073bf71f..ea7286ed 100644 --- a/finetune.py +++ b/finetune.py @@ -2375,29 +2375,3 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n outputs=[final_progress_data], ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - -import random -def find_available_port(start_port, end_port): - ports = list(range(start_port, end_port + 1)) - random.shuffle(ports) - return ports - -ports_to_try = find_available_port(7800, 7810) - -for port in ports_to_try: - try: - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=port, - server_name="127.0.0.1", - ) - print(f"Successfully launched on port {port}") - break - except OSError: - print(f"Port {port} is not available, trying next...") -else: - print("No available ports in the specified range.") - diff --git a/start_alltalk.sh b/start_alltalk.sh deleted file mode 100755 index 55e0756c..00000000 --- a/start_alltalk.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python script.py diff --git a/start_diagnostics.sh b/start_diagnostics.sh deleted file mode 100755 index c5586879..00000000 --- 
a/start_diagnostics.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python diagnostics.py diff --git a/start_environment.sh b/start_environment.sh deleted file mode 100755 index 9791ff36..00000000 --- a/start_environment.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -cd "." -if [[ "/home/eleven/alltalkbeta/alltalk_tts" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi -# deactivate existing conda envs as needed to avoid conflicts -{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null -# config -CONDA_ROOT_PREFIX="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda" -INSTALL_ENV_DIR="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME -export CUDA_PATH="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -export CUDA_HOME="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -# activate env -bash --init-file <(echo "source \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh\" && conda activate \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env\"") diff --git a/start_finetune.sh b/start_finetune.sh deleted file mode 100755 index c09d800b..00000000 --- a/start_finetune.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -export TRAINER_TELEMETRY=0 -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python finetune.py From 9047dcb13986d2fbaf052b082f0be5f211ad8523 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 1 Aug 2024 00:07:46 -0700 Subject: [PATCH 3/5] removed unecessary scripts and finetune.py edit --- finetune.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/finetune.py b/finetune.py index ea7286ed..b622baff 100644 --- a/finetune.py +++ b/finetune.py @@ -2375,3 +2375,12 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n outputs=[final_progress_data], ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) + + demo.queue().launch( + show_api=False, + inbrowser=True, + share=False, + debug=False, + server_port=7052, + server_name="127.0.0.1", + ) From 44bd45a6241ba07199088a63b823faf239125e12 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Fri, 2 Aug 2024 21:39:13 -0700 Subject: [PATCH 4/5] Fixed merge of vocab.json and new tokenizer for custom dataset Fixed vocab.json merge, new tokenizer for custom dataset, dataset cleaner. 
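For reference, the intended order of operations with these files is: extract plain transcripts from the metadata CSVs with extract_dataset_for_tokenizer.py, train the dataset-specific BPE tokenizer with custom_tokenizer.py, then merge its vocab and merges into the base model's vocab.json with compare_and_merge.py. A minimal sketch of that flow, assuming the two helper scripts are imported as modules (their module-level example calls would first need an `if __name__ == "__main__":` guard) and that every path shown is illustrative only:

    from custom_tokenizer import train_tokenizer
    from compare_and_merge import merge_vocabularies

    # 1. dataset.txt is the transcript-only text produced by
    #    extract_dataset_for_tokenizer.py from the combined metadata CSVs.
    # 2. Train the dataset-specific BPE tokenizer (the model is stuck at a
    #    vocab size of 256, so that cap is kept here).
    train_tokenizer(
        input_path="dataset.txt",
        tokenizer_path="expanded_models/bpe_tokenizer-vocab.json",
        language="multi",
        special_tokens=["[STOP]", "[UNK]", "[SPACE]"],
        vocab_size=256,
    )

    # 3. Append the new tokens and merges to the base model's vocab.json
    #    without renumbering any of its existing entries.
    merge_vocabularies(
        base_vocab_path="models/xtts/xttsv2_2.0.2/vocab.json",
        new_vocab_path="expanded_models/bpe_tokenizer-vocab.json",
        output_path="expanded_models/combined_vocab.json",
    )

The combined vocab.json is then what expand_xtts.py consumes when resizing the embedding layers.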
--- finetune.py | 14 +--- system/ft_tokenizer/compare_and_merge.py | 74 ++++++++++--------- system/ft_tokenizer/custom_tokenizer.py | 60 +++++++++++++++ .../extract_dataset_for_tokenizer.py | 26 +++++++ 4 files changed, 127 insertions(+), 47 deletions(-) create mode 100644 system/ft_tokenizer/custom_tokenizer.py create mode 100644 system/ft_tokenizer/extract_dataset_for_tokenizer.py diff --git a/finetune.py b/finetune.py index b622baff..96f1b8bf 100644 --- a/finetune.py +++ b/finetune.py @@ -918,7 +918,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, print(f"[FINETUNE] Learning Scheduler {lr_scheduler}, params {lr_scheduler_params}") - # training parameters config config = GPTTrainerConfig( epochs=num_epochs, @@ -951,8 +950,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, lr_scheduler=lr_scheduler, # it was adjusted accordly for the new step scheme lr_scheduler_params=lr_scheduler_params, - test_sentences=[], - ) progress(0, desc="Model is currently training. See console for more information") # init the model from config model = GPTTrainer.init_from_config(config) @@ -2374,13 +2371,4 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n fn=delete_voice_sample_contents, outputs=[final_progress_data], ) - model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=7052, - server_name="127.0.0.1", - ) + model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) \ No newline at end of file diff --git a/system/ft_tokenizer/compare_and_merge.py b/system/ft_tokenizer/compare_and_merge.py index dd1c4534..58d5b3a3 100644 --- a/system/ft_tokenizer/compare_and_merge.py +++ b/system/ft_tokenizer/compare_and_merge.py @@ -1,48 +1,54 @@ import json import os -# Define file paths -base_path = "/alltalk_tts/system/ft_tokenizer/" -original_file = os.path.join(base_path, "/alltalk_tts/models/xtts/xttsv2_2.0.3/vocab.json") # base model vocab.json -new_file = os.path.join(base_path, "/expanded_model/bpe_tokenizer-vocab.json") # new bpe tokenizer vocab.json -output_file = os.path.join(base_path, "/expanded_model/expanded_vocab.json") # output expanded_vocab.json file/path +def merge_vocabularies(base_vocab_path, new_vocab_path, output_path): + # Load the base model's vocab.json + with open(base_vocab_path, 'r') as f: + base_data = json.load(f) + + # Load the new bpe_tokenizer.json + with open(new_vocab_path, 'r') as f: + new_data = json.load(f) + # Extract the vocabularies + base_vocab = base_data['model']['vocab'] + new_vocab = new_data['model']['vocab'] -with open(original_file, 'r') as f: - original_data = json.load(f) + # Find the maximum value in the base vocabulary + max_value = max(base_vocab.values()) -with open(new_file, 'r') as f: - new_data = json.load(f) + # Merge the vocabularies + for key, value in new_vocab.items(): + if key not in base_vocab: + max_value += 1 + base_vocab[key] = max_value -# Get the original and new vocabularies -original_vocab = original_data['model']['vocab'] -new_vocab = new_data['model']['vocab'] + # Update the base data with the merged vocabulary + base_data['model']['vocab'] = base_vocab -# Find the maximum value in the original vocabulary. 
We do this so we make sure to append and not overwrite any new values -max_value = max(original_vocab.values()) + # Extract the merges + base_merges = base_data['model']['merges'] + new_merges = new_data['model']['merges'] -# merge -for key, value in new_vocab.items(): - if key not in original_vocab: - max_value += 1 - original_vocab[key] = max_value + # Merge the merges + merged_merges = base_merges.copy() + for merge in new_merges: + if merge not in merged_merges: + merged_merges.append(merge) -# Update -original_data['model']['vocab'] = original_vocab + # Update the base data with the merged merges + base_data['model']['merges'] = merged_merges -# Get the original and new merges -original_merges = original_data['model']['merges'] -new_merges = new_data['model']['merges'] + # Write the merged vocabulary and merges to the output file + with open(output_path, 'w') as f: + json.dump(base_data, f, ensure_ascii=False, indent=2) -# Merge the merges -merged_merges = original_merges.copy() -for merge in new_merges: - if merge not in merged_merges: - merged_merges.append(merge) + print(f"Merged vocabulary and merges saved to {output_path}") -# Update -original_data['model']['merges'] = merged_merges -with open(output_file, 'w') as f: - json.dump(original_data, f, ensure_ascii=False, indent=2) +# Define file paths +base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json" # base model vocab.json path (2.0.2) +new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # path to the custom dataset vocab.json +output_path = "/alltalk_tts/expanded_models/combined_vocab.json" # location for combined vocab.json -print(f"Merged vocabulary and merges saved to {output_file}") +# Merge the vocabularies +merge_vocabularies(base_vocab_path, new_vocab_path, output_path) \ No newline at end of file diff --git a/system/ft_tokenizer/custom_tokenizer.py b/system/ft_tokenizer/custom_tokenizer.py new file mode 100644 index 00000000..43624252 --- /dev/null +++ b/system/ft_tokenizer/custom_tokenizer.py @@ -0,0 +1,60 @@ +# Credit Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py +# provide a cleaned txt file with only the transcription. 
Youll get back out the datasets vocab.json + +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer +from tokenizers.pre_tokenizers import Whitespace +from tkinter import Tk, filedialog +import json +import re + +def clean_text(input_file_path, output_file_path): + # Define the pattern to match numbers, specific symbols, and new lines + # add \d to match any digit, and | is used to specify alternatives + pattern = r'|�|«|\$|\n' + + with open(input_file_path, 'r', encoding='utf-8') as input_file: + text = input_file.read() + cleaned_text = re.sub(pattern, '', text) + + with open(output_file_path, 'w', encoding='utf-8') as output_file: + output_file.write(cleaned_text) + +def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]" ], vocab_size=256): + # Initialize a tokenizer with the BPE model + tokenizer = Tokenizer(BPE(unk_token="[UNK]")) + # Use a basic whitespace pre-tokenizer + tokenizer.pre_tokenizer = Whitespace() + + # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256) + trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size) + + + clean_text(input_path, input_path) + tokenizer.train([input_path], trainer) + + tokenizer.save(tokenizer_path) + + with open(tokenizer_path, 'r', encoding='utf-8') as f: + tokenizer_json = json.load(f) + + # Add language to tokenizer + tokenizer_json['model']['language'] = language + + with open(tokenizer_path, 'w', encoding='utf-8') as f: + json.dump(tokenizer_json, f, ensure_ascii=False, indent=4) + +def choose_file(): + root = Tk() + root.withdraw() + file = filedialog.askopenfilename() + root.destroy() + return file + +if __name__ == "__main__": + input_path = choose_file() + tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # define path to put the newly create vocab.json + special_tokens = ["[STOP]", "[UNK]", "[SPACE]"] + vocab_size = 256 # model is stuck at this size + train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size) \ No newline at end of file diff --git a/system/ft_tokenizer/extract_dataset_for_tokenizer.py b/system/ft_tokenizer/extract_dataset_for_tokenizer.py new file mode 100644 index 00000000..36dee0c5 --- /dev/null +++ b/system/ft_tokenizer/extract_dataset_for_tokenizer.py @@ -0,0 +1,26 @@ +# Simple script to remove LJ Speech formatting for the tokenizer +# Combine metadata_train and metadata_eval.csv into a single file then run +import csv + +# Input and output file names +input_file = '/alltalkbeta/metadata_eval.csv' # combine metadata_train and metadata_eval.csv +output_file = '/alltalkbeta/dataset.txt' # this goes to the tokenizer + +# Read the input CSV and write to the output file +with open(input_file, 'r', newline='', encoding='utf-8') as infile, \ + open(output_file, 'w', newline='', encoding='utf-8') as outfile: + + # Create CSV reader and writer objects + reader = csv.reader(infile, delimiter='|') + writer = csv.writer(outfile, delimiter='|') + + # Skip the header + next(reader, None) + + # Process each row + for row in reader: + if len(row) >= 2: + # Write only the second column (index 1) to the output file + writer.writerow([row[1]]) + +print(f"Processing complete. 
Output written to {output_file}") \ No newline at end of file From aafae8da82803857f3f0b6e289d22d552eeae2d1 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 19 Sep 2024 21:11:31 -0700 Subject: [PATCH 5/5] Fixed slurred speech by freezing all layers but the two embedding layers --- system/ft_tokenizer/expand_xtts.py | 53 ++++++++++++++---------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/system/ft_tokenizer/expand_xtts.py b/system/ft_tokenizer/expand_xtts.py index 2c79f0c0..832fa7ba 100644 --- a/system/ft_tokenizer/expand_xtts.py +++ b/system/ft_tokenizer/expand_xtts.py @@ -43,20 +43,13 @@ def get_vocab_size(tokenizer_path): # Function to adjust the pretrained model with a new tokenizer def adjust_pretrained_model( - pretrained_model_path, adjusted_model_path, new_tokenizer_path -): - # Load the pretrained model state dictionary + pretrained_model_path, adjusted_model_path, new_tokenizer_path): state_dict = torch.load(pretrained_model_path) pretrained_state_dict = state_dict["model"] - - # Create a new Xtts model instance with the loaded configuration model = Xtts(config) # Load the pretrained state dictionary into the new model - missing_keys, unexpected_keys = model.load_state_dict( - pretrained_state_dict, strict=False - ) - # Print any missing or unexpected keys for debugging + missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False) if missing_keys: print(f"Missing keys: {missing_keys}") if unexpected_keys: @@ -71,20 +64,18 @@ def adjust_pretrained_model( new_vocab_size = len(new_tokenizer.tokenizer.get_vocab()) embedding_dim = model.gpt.text_embedding.embedding_dim - # Print vocabulary sizes and embedding dimension for debugging print(f"Old vocab size: {old_vocab_size}") print(f"New vocab size: {new_vocab_size}") print(f"Embedding dimension: {embedding_dim}") # Adjust the embedding layer with the new vocabulary size - adjust_embedding_layer( - model, pretrained_state_dict, new_vocab_size, adjusted_model_path - ) - -# Function to adjust the embedding layer for the new vocabulary size -def adjust_embedding_layer( - model, pretrained_state_dict, new_vocab_size, adjusted_model_path -): + adjust_embedding_layer(model, new_vocab_size, adjusted_model_path) + + # Freeze all parameters except the position embeddings + freeze_except_position_embeddings(model) + + # Function to adjust the embedding layer for the new vocabulary size +def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path): old_vocab_size = model.gpt.text_embedding.num_embeddings embedding_dim = model.gpt.text_embedding.embedding_dim @@ -94,33 +85,37 @@ def adjust_embedding_layer( # Copy weights from the old embedding layer to the new one if new_vocab_size > old_vocab_size: - # If the new vocabulary is larger, copy existing weights and initialize new ones - new_text_embedding.weight.data[:old_vocab_size] = ( - model.gpt.text_embedding.weight.data - ) + new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data - # Initialize new weights with normal distribution new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02) else: - # If the new vocabulary is smaller, truncate the existing weights - new_text_embedding.weight.data 
= model.gpt.text_embedding.weight.data[ - :new_vocab_size - ] + new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size] new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size] new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size] - # Replace the old embedding layer with the new one model.gpt.text_embedding = new_text_embedding model.gpt.text_head = new_text_head - # Save the adjusted model checkpoint = {"model": model.state_dict()} torch.save(checkpoint, adjusted_model_path) print(f"Adjusted model saved to {adjusted_model_path}") +# Function to freeze all parameters except the position embeddings +def freeze_except_position_embeddings(model): + for param in model.parameters(): + param.requires_grad = False + + for name, param in model.named_parameters(): + if 'pos_embedding' in name: + param.requires_grad = True + + # Verify which parameters are frozen and which are not, comment this out if you dont want to debug. You should see only two true values + for name, param in model.named_parameters(): + print(f"{name}: requires_grad={param.requires_grad}") + # Expand the pretrained model with the new tokenizer adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
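Before launching a fine-tuning run on the expanded checkpoint, it is worth confirming that the freeze behaved as intended and that only the position-embedding parameters are still trainable. A minimal sketch, assuming `model` is the Xtts instance after adjust_pretrained_model() has run (which now also calls freeze_except_position_embeddings()); the helper name summarize_trainable is illustrative and not part of this patch, and it condenses the per-parameter requires_grad printout already in the script:

    # Print only the parameters that will actually receive gradients plus the
    # total count of frozen weights; with the freeze above, the trainable list
    # should contain just the GPT position-embedding tensors.
    def summarize_trainable(model):
        frozen = 0
        trainable = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                trainable.append((name, param.numel()))
            else:
                frozen += param.numel()
        for name, numel in trainable:
            print(f"trainable: {name} ({numel} weights)")
        print(f"frozen weights: {frozen}")
        return trainable

    summarize_trainable(model)

Note that requires_grad flags are not persisted in the saved state dict, so the same freeze has to be re-applied after loading expanded_model.pth for training.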