diff --git a/finetune.py b/finetune.py
index f675c266..d5742feb 100644
--- a/finetune.py
+++ b/finetune.py
@@ -920,7 +920,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
 
     print(f"[FINETUNE] Learning Scheduler {lr_scheduler}, params {lr_scheduler_params}")
 
-
     # training parameters config
     config = GPTTrainerConfig(
         epochs=num_epochs,
@@ -953,8 +952,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
         lr_scheduler=lr_scheduler, # it was adjusted accordly for the new step scheme
         lr_scheduler_params=lr_scheduler_params,
-        test_sentences=[],
-
     )
     progress(0, desc="Model is currently training. See console for more information")
     # init the model from config
     model = GPTTrainer.init_from_config(config)
@@ -2384,14 +2381,4 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n
         fn=delete_voice_sample_contents,
         outputs=[final_progress_data],
     )
-    model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)
-
-    demo.queue().launch(
-        show_api=False,
-        inbrowser=True,
-        share=False,
-        debug=False,
-        server_port=7052,
-        server_name="127.0.0.1",
-    )
-
+    model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)
\ No newline at end of file
diff --git a/system/ft_tokenizer/compare_and_merge.py b/system/ft_tokenizer/compare_and_merge.py
new file mode 100644
index 00000000..58d5b3a3
--- /dev/null
+++ b/system/ft_tokenizer/compare_and_merge.py
@@ -0,0 +1,54 @@
+import json
+import os
+
+def merge_vocabularies(base_vocab_path, new_vocab_path, output_path):
+    # Load the base model's vocab.json
+    with open(base_vocab_path, 'r') as f:
+        base_data = json.load(f)
+
+    # Load the new bpe_tokenizer.json
+    with open(new_vocab_path, 'r') as f:
+        new_data = json.load(f)
+
+    # Extract the vocabularies
+    base_vocab = base_data['model']['vocab']
+    new_vocab = new_data['model']['vocab']
+
+    # Find the maximum value in the base vocabulary
+    max_value = max(base_vocab.values())
+
+    # Merge the vocabularies
+    for key, value in new_vocab.items():
+        if key not in base_vocab:
+            max_value += 1
+            base_vocab[key] = max_value
+
+    # Update the base data with the merged vocabulary
+    base_data['model']['vocab'] = base_vocab
+
+    # Extract the merges
+    base_merges = base_data['model']['merges']
+    new_merges = new_data['model']['merges']
+
+    # Merge the merges
+    merged_merges = base_merges.copy()
+    for merge in new_merges:
+        if merge not in merged_merges:
+            merged_merges.append(merge)
+
+    # Update the base data with the merged merges
+    base_data['model']['merges'] = merged_merges
+
+    # Write the merged vocabulary and merges to the output file
+    with open(output_path, 'w') as f:
+        json.dump(base_data, f, ensure_ascii=False, indent=2)
+
+    print(f"Merged vocabulary and merges saved to {output_path}")
+
+# Define file paths
+base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json" # base model vocab.json path (2.0.2)
+new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # path to the custom dataset vocab.json
+output_path = "/alltalk_tts/expanded_models/combined_vocab.json" # location for combined vocab.json
+
+# Merge the vocabularies
+merge_vocabularies(base_vocab_path, new_vocab_path, output_path)
\ No newline at end of file
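Note on the merge rule in compare_and_merge.py: new tokens are appended after the base vocabulary's highest id, so the base model's existing token ids never move. A quick toy illustration of the same idea (standalone Python, made-up tokens, not part of the patch):

base_vocab = {"[STOP]": 0, "[UNK]": 1, "hello": 2}   # pretend base model vocab
new_vocab = {"hello": 0, "wor": 1, "ld": 2}           # pretend dataset tokenizer vocab

max_value = max(base_vocab.values())                  # 2
for key in new_vocab:
    if key not in base_vocab:                         # only genuinely new tokens get added
        max_value += 1
        base_vocab[key] = max_value

print(base_vocab)
# {'[STOP]': 0, '[UNK]': 1, 'hello': 2, 'wor': 3, 'ld': 4}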
diff --git a/system/ft_tokenizer/custom_tokenizer.py b/system/ft_tokenizer/custom_tokenizer.py
new file mode 100644
index 00000000..43624252
--- /dev/null
+++ b/system/ft_tokenizer/custom_tokenizer.py
@@ -0,0 +1,60 @@
+# Credit Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py
+# Provide a cleaned txt file with only the transcription. You'll get back the dataset's vocab.json
+
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Whitespace
+from tkinter import Tk, filedialog
+import json
+import re
+
+def clean_text(input_file_path, output_file_path):
+    # Pattern matching the specific symbols and new lines to strip out
+    # (add \d as another alternative to also strip digits; | separates alternatives)
+    pattern = r'�|«|\$|\n'
+
+    with open(input_file_path, 'r', encoding='utf-8') as input_file:
+        text = input_file.read()
+    cleaned_text = re.sub(pattern, '', text)
+
+    with open(output_file_path, 'w', encoding='utf-8') as output_file:
+        output_file.write(cleaned_text)
+
+def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]"], vocab_size=256):
+    # Initialize a tokenizer with the BPE model
+    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    # Use a basic whitespace pre-tokenizer
+    tokenizer.pre_tokenizer = Whitespace()
+
+    # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256)
+    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
+
+
+    clean_text(input_path, input_path)
+    tokenizer.train([input_path], trainer)
+
+    tokenizer.save(tokenizer_path)
+
+    with open(tokenizer_path, 'r', encoding='utf-8') as f:
+        tokenizer_json = json.load(f)
+
+    # Add language to tokenizer
+    tokenizer_json['model']['language'] = language
+
+    with open(tokenizer_path, 'w', encoding='utf-8') as f:
+        json.dump(tokenizer_json, f, ensure_ascii=False, indent=4)
+
+def choose_file():
+    root = Tk()
+    root.withdraw()
+    file = filedialog.askopenfilename()
+    root.destroy()
+    return file
+
+if __name__ == "__main__":
+    input_path = choose_file()
+    tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # define path to put the newly created vocab.json
+    special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]
+    vocab_size = 256 # model is stuck at this size
+    train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size)
\ No newline at end of file
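If you want to sanity-check what train_tokenizer() wrote before merging it into the base vocab, a small sketch using plain json (so the extra "language" key added above is not an issue); the path is simply the example output path from the script:

import json

with open("/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json", "r", encoding="utf-8") as f:
    tok = json.load(f)

print("language:", tok["model"].get("language"))     # 'multi' in the example above
print("vocab size:", len(tok["model"]["vocab"]))     # capped by vocab_size=256
print("first merges:", tok["model"]["merges"][:5])   # learned BPE merge rules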
diff --git a/system/ft_tokenizer/expand_xtts.py b/system/ft_tokenizer/expand_xtts.py
new file mode 100644
index 00000000..832fa7ba
--- /dev/null
+++ b/system/ft_tokenizer/expand_xtts.py
@@ -0,0 +1,121 @@
+"""
+This script does not have any specific alltalk integration yet, so paths are entered manually and it works as a standalone.
+
+The script does the following:
+    - Expands the embedding layer of the base XTTSv2 model according to the user created/trained bpe_tokenizer-vocab.json and the base model vocab.json.
+Set the variable paths to the base config.json and base model.pth.
+Set the new tokenizer/vocab bpe_tokenizer-vocab.json location.
+The new model will be saved at \expanded_models\expanded_model.pth
+
+Once this is done, the new expanded model must be swapped in for the base model.pth and then combined with the dvae.pth, vocab.json (bpe_tokenizer-vocab.json), base model config.json,
+base model speaker_xtts.pth and base model vocoder.json.
+
+
+I left some print debug statements in the script; they may be useful for the user to see during the process.
+
+"""
+
+import torch
+import torch.nn as nn
+import json
+from TTS.tts.models.xtts import Xtts
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig
+from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
+
+config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json" # Path to the base model config.json
+pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth" # Path to the base model.pth
+new_tokenizer_path = "/expanded_model/expanded_vocab.json" # Path to the new combined expanded_vocab.json
+expanded_model_path = "/expanded_model/expanded_model.pth" # Path to where you want the new expanded_model.pth
+
+
+# Open and load the configuration file
+with open(config_path, "r") as f:
+    config_dict = json.load(f)
+
+# Create a GPTTrainerConfig object and populate it with the loaded configuration
+config = GPTTrainerConfig()
+config.from_dict(data=config_dict)
+
+# Function to get the vocabulary size from a tokenizer file
+def get_vocab_size(tokenizer_path):
+    tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path)
+    return len(tokenizer.tokenizer.get_vocab())
+
+# Function to adjust the pretrained model with a new tokenizer
+def adjust_pretrained_model(
+        pretrained_model_path, adjusted_model_path, new_tokenizer_path):
+    state_dict = torch.load(pretrained_model_path)
+    pretrained_state_dict = state_dict["model"]
+    model = Xtts(config)
+
+    # Load the pretrained state dictionary into the new model
+    missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False)
+    if missing_keys:
+        print(f"Missing keys: {missing_keys}")
+    if unexpected_keys:
+        print(f"Unexpected keys: {unexpected_keys}")
+    print("Pretrained model loaded successfully.")
+
+    # Create a new tokenizer with the new vocabulary
+    new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path)
+
+    # Get the old and new vocabulary sizes, and the embedding dimension
+    old_vocab_size = model.gpt.text_embedding.num_embeddings
+    new_vocab_size = len(new_tokenizer.tokenizer.get_vocab())
+    embedding_dim = model.gpt.text_embedding.embedding_dim
+
+    print(f"Old vocab size: {old_vocab_size}")
+    print(f"New vocab size: {new_vocab_size}")
+    print(f"Embedding dimension: {embedding_dim}")
+
+    # Adjust the embedding layer with the new vocabulary size
+    adjust_embedding_layer(model, new_vocab_size, adjusted_model_path)
+
+    # Freeze all parameters except the position embeddings
+    freeze_except_position_embeddings(model)
+
+# Function to adjust the embedding layer for the new vocabulary size
+def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path):
+    old_vocab_size = model.gpt.text_embedding.num_embeddings
+    embedding_dim = model.gpt.text_embedding.embedding_dim
+
+    # Create new embedding and linear layers with the new vocabulary size
+    new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim)
+    new_text_head = nn.Linear(embedding_dim, new_vocab_size)
+
+    # Copy weights from the old embedding layer to the new one
+    if new_vocab_size > old_vocab_size:
+        new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data
+        new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data
+        new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data
+
+        new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
+        new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
+        new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
+    else:
+        new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size]
+        new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size]
+        new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size]
+
+    model.gpt.text_embedding = new_text_embedding
+    model.gpt.text_head = new_text_head
+
+    checkpoint = {"model": model.state_dict()}
+    torch.save(checkpoint, adjusted_model_path)
+    print(f"Adjusted model saved to {adjusted_model_path}")
+
+# Function to freeze all parameters except the position embeddings
+def freeze_except_position_embeddings(model):
+    for param in model.parameters():
+        param.requires_grad = False
+
+    for name, param in model.named_parameters():
+        if 'pos_embedding' in name:
+            param.requires_grad = True
+
+    # Verify which parameters are frozen and which are not; comment this out if you don't want the debug output. You should see only two True values.
+    for name, param in model.named_parameters():
+        print(f"{name}: requires_grad={param.requires_grad}")
+
+# Expand the pretrained model with the new tokenizer
+adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
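The core of adjust_embedding_layer() above is an nn.Embedding/nn.Linear resize that keeps the already-trained rows and randomly initialises only the rows added for the new tokens. A minimal standalone sketch of that trick with toy sizes (no XTTS dependency; names and numbers are made up):

import torch.nn as nn

old_vocab_size, new_vocab_size, embedding_dim = 5, 8, 4   # toy sizes
old_embedding = nn.Embedding(old_vocab_size, embedding_dim)

new_embedding = nn.Embedding(new_vocab_size, embedding_dim)
new_embedding.weight.data[:old_vocab_size] = old_embedding.weight.data   # keep trained rows
new_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)   # init the added rows

print(new_embedding.weight.shape)   # torch.Size([8, 4])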
diff --git a/system/ft_tokenizer/extract_dataset_for_tokenizer.py b/system/ft_tokenizer/extract_dataset_for_tokenizer.py
new file mode 100644
index 00000000..36dee0c5
--- /dev/null
+++ b/system/ft_tokenizer/extract_dataset_for_tokenizer.py
@@ -0,0 +1,26 @@
+# Simple script to remove LJ Speech formatting for the tokenizer
+# Combine metadata_train and metadata_eval.csv into a single file then run
+import csv
+
+# Input and output file names
+input_file = '/alltalkbeta/metadata_eval.csv' # combine metadata_train and metadata_eval.csv
+output_file = '/alltalkbeta/dataset.txt' # this goes to the tokenizer
+
+# Read the input CSV and write to the output file
+with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
+     open(output_file, 'w', newline='', encoding='utf-8') as outfile:
+
+    # Create CSV reader and writer objects
+    reader = csv.reader(infile, delimiter='|')
+    writer = csv.writer(outfile, delimiter='|')
+
+    # Skip the header
+    next(reader, None)
+
+    # Process each row
+    for row in reader:
+        if len(row) >= 2:
+            # Write only the second column (index 1) to the output file
+            writer.writerow([row[1]])
+
+print(f"Processing complete. Output written to {output_file}")
\ No newline at end of file
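The comments above assume metadata_train.csv and metadata_eval.csv have already been combined into one file. A minimal sketch of that step, assuming both files use the same LJ Speech-style pipe-delimited layout and the example /alltalkbeta paths (the combined filename is only an illustration):

import csv

train_file = '/alltalkbeta/metadata_train.csv'
eval_file = '/alltalkbeta/metadata_eval.csv'
combined_file = '/alltalkbeta/metadata_combined.csv'   # point input_file above at this

with open(combined_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, delimiter='|')
    for i, path in enumerate([train_file, eval_file]):
        with open(path, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile, delimiter='|')
            header = next(reader, None)
            if i == 0 and header:
                writer.writerow(header)   # keep a single header row
            writer.writerows(reader)      # copy the remaining rows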