Alltalkbeta #288

Merged
merged 9 commits on Oct 20, 2024
15 changes: 1 addition & 14 deletions finetune.py
@@ -920,7 +920,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,


print(f"[FINETUNE] Learning Scheduler {lr_scheduler}, params {lr_scheduler_params}")

# training parameters config
config = GPTTrainerConfig(
epochs=num_epochs,
@@ -953,8 +952,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
lr_scheduler=lr_scheduler,
# it was adjusted accordingly for the new step scheme
lr_scheduler_params=lr_scheduler_params,
test_sentences=[],
)
progress(0, desc="Model is currently training. See console for more information")
# init the model from config
model = GPTTrainer.init_from_config(config)
@@ -2384,14 +2381,4 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n
fn=delete_voice_sample_contents,
outputs=[final_progress_data],
)
model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)

demo.queue().launch(
show_api=False,
inbrowser=True,
share=False,
debug=False,
server_port=7052,
server_name="127.0.0.1",
)

model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)
54 changes: 54 additions & 0 deletions system/ft_tokenizer/compare_and_merge.py
@@ -0,0 +1,54 @@
import json
import os

def merge_vocabularies(base_vocab_path, new_vocab_path, output_path):
    # Load the base model's vocab.json
    with open(base_vocab_path, 'r') as f:
        base_data = json.load(f)

    # Load the new bpe_tokenizer.json
    with open(new_vocab_path, 'r') as f:
        new_data = json.load(f)

    # Extract the vocabularies
    base_vocab = base_data['model']['vocab']
    new_vocab = new_data['model']['vocab']

    # Find the maximum value in the base vocabulary
    max_value = max(base_vocab.values())

    # Merge the vocabularies
    for key, value in new_vocab.items():
        if key not in base_vocab:
            max_value += 1
            base_vocab[key] = max_value

    # Update the base data with the merged vocabulary
    base_data['model']['vocab'] = base_vocab

    # Extract the merges
    base_merges = base_data['model']['merges']
    new_merges = new_data['model']['merges']

    # Merge the merges
    merged_merges = base_merges.copy()
    for merge in new_merges:
        if merge not in merged_merges:
            merged_merges.append(merge)

    # Update the base data with the merged merges
    base_data['model']['merges'] = merged_merges

    # Write the merged vocabulary and merges to the output file
    with open(output_path, 'w') as f:
        json.dump(base_data, f, ensure_ascii=False, indent=2)

    print(f"Merged vocabulary and merges saved to {output_path}")

# Define file paths
base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json" # base model vocab.json path (2.0.2)
new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # path to the custom dataset vocab.json
output_path = "/alltalk_tts/expanded_models/combined_vocab.json" # location for combined vocab.json

# Merge the vocabularies
merge_vocabularies(base_vocab_path, new_vocab_path, output_path)
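
A quick sanity check after running compare_and_merge.py (not part of the PR; the paths below simply mirror the defaults above) could confirm that the merge only appends new tokens and never re-numbers the base ones:

import json

base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json"   # base vocab used above
merged_vocab_path = "/alltalk_tts/expanded_models/combined_vocab.json"         # output of merge_vocabularies

with open(base_vocab_path, 'r', encoding='utf-8') as f:
    base_vocab = json.load(f)['model']['vocab']
with open(merged_vocab_path, 'r', encoding='utf-8') as f:
    merged_vocab = json.load(f)['model']['vocab']

# Every base token keeps its original id; the merged vocab can only grow
assert all(merged_vocab[token] == idx for token, idx in base_vocab.items())
assert len(merged_vocab) >= len(base_vocab)
print(f"Base tokens: {len(base_vocab)}, merged tokens: {len(merged_vocab)}")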
60 changes: 60 additions & 0 deletions system/ft_tokenizer/custom_tokenizer.py
@@ -0,0 +1,60 @@
# Credit Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py
# Provide a cleaned txt file containing only the transcriptions. You'll get back the dataset's vocab.json.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tkinter import Tk, filedialog
import json
import re

def clean_text(input_file_path, output_file_path):
    # Define the pattern to match specific symbols and new lines
    # (| separates alternatives; add \d as another alternative to also strip digits)
    pattern = r'�|«|\$|\n'

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        text = input_file.read()
    cleaned_text = re.sub(pattern, '', text)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)

def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]" ], vocab_size=256):
    # Initialize a tokenizer with the BPE model
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # Use a basic whitespace pre-tokenizer
    tokenizer.pre_tokenizer = Whitespace()

    # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256)
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    # Clean the transcript in place (the input file is overwritten), then train on it
    clean_text(input_path, input_path)
    tokenizer.train([input_path], trainer)

    tokenizer.save(tokenizer_path)

    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_json = json.load(f)

    # Add language to tokenizer
    tokenizer_json['model']['language'] = language

    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_json, f, ensure_ascii=False, indent=4)

def choose_file():
    root = Tk()
    root.withdraw()
    file = filedialog.askopenfilename()
    root.destroy()
    return file

if __name__ == "__main__":
    input_path = choose_file()
    tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # path where the newly created vocab.json will be written
    special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]
    vocab_size = 256  # the model is stuck at this size
    train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size)
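
A minimal smoke test of the freshly written tokenizer file (again not part of the PR; the path is the one hard-coded above) might just reload the JSON and report what was produced:

import json

tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # file written by train_tokenizer above

with open(tokenizer_path, 'r', encoding='utf-8') as f:
    tokenizer_json = json.load(f)

vocab = tokenizer_json['model']['vocab']
print("Language tag:", tokenizer_json['model'].get('language'))   # set to 'multi' above
print("Vocab size:", len(vocab))                                  # capped by the vocab_size (256) used above
print("First tokens:", list(vocab)[:10])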
121 changes: 121 additions & 0 deletions system/ft_tokenizer/expand_xtts.py
@@ -0,0 +1,121 @@
"""
This script does not have any specific alltalk integration yet. So we are manually entering paths and it works as a standalone.

The script does the following:
- Expands the embedding layer of the base XTTSv2 model according to the user created/trained bpe_tokenizer-vocab.json and the base model vocab.json.
Set variable paths with the base config and base model.pth.
Set the new tokenizer/vocab bpe_tokenizer-vocab.json location.
The new model will be saved at \expanded_models\expanded_model.pth

Once this is done the new expanded model must be swapped in for the base model.pth then combined with the dvae.pth, vocab.json(bpe_tokenizer-vocab.json), base model config.json,
base model speaker_xtts.pth and base model vocoder.json.


I left some print debug statements in the script, they may be nice for the user to see during the process.

"""

import torch
import torch.nn as nn
import json
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json" # Path to the base model config.json
pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth" # Path to the base model.pth
new_tokenizer_path = "/expanded_model/expanded_vocab.json" # Path to the new combined expanded_vocab.json
expanded_model_path = "/expanded_model/expanded_model.pth" # Path to where you want the new expanded_model.pth


# Open and load the configuration file
with open(config_path, "r") as f:
    config_dict = json.load(f)

# Create a GPTTrainerConfig object and populate it with the loaded configuration
config = GPTTrainerConfig()
config.from_dict(data=config_dict)

# Function to get the vocabulary size from a tokenizer file
def get_vocab_size(tokenizer_path):
    tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path)
    return len(tokenizer.tokenizer.get_vocab())

# Function to adjust the pretrained model with a new tokenizer
def adjust_pretrained_model(
        pretrained_model_path, adjusted_model_path, new_tokenizer_path):
    state_dict = torch.load(pretrained_model_path)
    pretrained_state_dict = state_dict["model"]
    model = Xtts(config)

    # Load the pretrained state dictionary into the new model
    missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False)
    if missing_keys:
        print(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        print(f"Unexpected keys: {unexpected_keys}")
    print("Pretrained model loaded successfully.")

    # Create a new tokenizer with the new vocabulary
    new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path)

    # Get the old and new vocabulary sizes, and the embedding dimension
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    new_vocab_size = len(new_tokenizer.tokenizer.get_vocab())
    embedding_dim = model.gpt.text_embedding.embedding_dim

    print(f"Old vocab size: {old_vocab_size}")
    print(f"New vocab size: {new_vocab_size}")
    print(f"Embedding dimension: {embedding_dim}")

    # Adjust the embedding layer with the new vocabulary size
    adjust_embedding_layer(model, new_vocab_size, adjusted_model_path)

    # Freeze all parameters except the position embeddings
    freeze_except_position_embeddings(model)

# Function to adjust the embedding layer for the new vocabulary size
def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path):
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    embedding_dim = model.gpt.text_embedding.embedding_dim

    # Create new embedding and linear layers with the new vocabulary size
    new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim)
    new_text_head = nn.Linear(embedding_dim, new_vocab_size)

    # Copy weights from the old embedding layer to the new one
    if new_vocab_size > old_vocab_size:
        new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data
        new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data
        new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data

        new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
    else:
        new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size]
        new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size]
        new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size]

    model.gpt.text_embedding = new_text_embedding
    model.gpt.text_head = new_text_head

    checkpoint = {"model": model.state_dict()}
    torch.save(checkpoint, adjusted_model_path)
    print(f"Adjusted model saved to {adjusted_model_path}")

# Function to freeze all parameters except the position embeddings
def freeze_except_position_embeddings(model):
    for param in model.parameters():
        param.requires_grad = False

    for name, param in model.named_parameters():
        if 'pos_embedding' in name:
            param.requires_grad = True

    # Verify which parameters are frozen and which are not; comment this out if you don't want the debug output.
    # You should see only two True values.
    for name, param in model.named_parameters():
        print(f"{name}: requires_grad={param.requires_grad}")

# Expand the pretrained model with the new tokenizer
adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
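
The final assembly step described in the docstring (swapping the expanded model in for model.pth and pairing it with the base model's remaining files) is not scripted in this PR. A rough sketch of what it might look like is below; every directory and file name is an assumption based on the docstring and the other scripts, so adjust it to your own layout:

import shutil
from pathlib import Path

base_dir = Path("/alltalk_tts/models/xtts/xttsv2_2.0.3")   # base model folder (assumed)
expanded_dir = Path("/alltalk_tts/expanded_models")        # destination folder (assumed)
expanded_dir.mkdir(parents=True, exist_ok=True)

# The expanded checkpoint takes the place of the base model.pth
shutil.copy2(expanded_dir / "expanded_model.pth", expanded_dir / "model.pth")

# The merged tokenizer becomes the model's vocab.json
shutil.copy2(expanded_dir / "combined_vocab.json", expanded_dir / "vocab.json")

# The remaining files come straight from the base model
for name in ("dvae.pth", "config.json", "speaker_xtts.pth", "vocoder.json"):
    shutil.copy2(base_dir / name, expanded_dir / name)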
26 changes: 26 additions & 0 deletions system/ft_tokenizer/extract_dataset_for_tokenizer.py
@@ -0,0 +1,26 @@
# Simple script to remove LJ Speech formatting for the tokenizer
# Combine metadata_train.csv and metadata_eval.csv into a single file, then run this script
import csv

# Input and output file names
input_file = '/alltalkbeta/metadata_eval.csv' # combine metadata_train and metadata_eval.csv
output_file = '/alltalkbeta/dataset.txt' # this goes to the tokenizer

# Read the input CSV and write to the output file
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    # Create CSV reader and writer objects
    reader = csv.reader(infile, delimiter='|')
    writer = csv.writer(outfile, delimiter='|')

    # Skip the header
    next(reader, None)

    # Process each row
    for row in reader:
        if len(row) >= 2:
            # Write only the second column (index 1) to the output file
            writer.writerow([row[1]])

print(f"Processing complete. Output written to {output_file}")