From d44d721d6920297a0d3d06bd27c3a372613a80bb Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Wed, 31 Jul 2024 18:30:47 -0700 Subject: [PATCH 1/5] Tokenizer changes --- finetune.py | 32 ++++-- start_alltalk.sh | 4 + start_diagnostics.sh | 4 + start_environment.sh | 16 +++ start_finetune.sh | 5 + system/ft_tokenizer/compare_and_merge.py | 48 +++++++++ system/ft_tokenizer/expand_xtts.py | 126 +++++++++++++++++++++++ 7 files changed, 227 insertions(+), 8 deletions(-) create mode 100755 start_alltalk.sh create mode 100755 start_diagnostics.sh create mode 100755 start_environment.sh create mode 100755 start_finetune.sh create mode 100644 system/ft_tokenizer/compare_and_merge.py create mode 100644 system/ft_tokenizer/expand_xtts.py diff --git a/finetune.py b/finetune.py index f4b11f8b..073bf71f 100644 --- a/finetune.py +++ b/finetune.py @@ -2376,12 +2376,28 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=7052, - server_name="127.0.0.1", - ) +import random +def find_available_port(start_port, end_port): + ports = list(range(start_port, end_port + 1)) + random.shuffle(ports) + return ports + +ports_to_try = find_available_port(7800, 7810) + +for port in ports_to_try: + try: + demo.queue().launch( + show_api=False, + inbrowser=True, + share=False, + debug=False, + server_port=port, + server_name="127.0.0.1", + ) + print(f"Successfully launched on port {port}") + break + except OSError: + print(f"Port {port} is not available, trying next...") +else: + print("No available ports in the specified range.") diff --git a/start_alltalk.sh b/start_alltalk.sh new file mode 100755 index 00000000..55e0756c --- /dev/null +++ b/start_alltalk.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python script.py diff --git a/start_diagnostics.sh b/start_diagnostics.sh new file mode 100755 index 00000000..c5586879 --- /dev/null +++ b/start_diagnostics.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python diagnostics.py diff --git a/start_environment.sh b/start_environment.sh new file mode 100755 index 00000000..9791ff36 --- /dev/null +++ b/start_environment.sh @@ -0,0 +1,16 @@ +#!/bin/bash +cd "." +if [[ "/home/eleven/alltalkbeta/alltalk_tts" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. 
&& exit; fi +# deactivate existing conda envs as needed to avoid conflicts +{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null +# config +CONDA_ROOT_PREFIX="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda" +INSTALL_ENV_DIR="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME +export CUDA_PATH="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +export CUDA_HOME="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +# activate env +bash --init-file <(echo "source \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh\" && conda activate \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env\"") diff --git a/start_finetune.sh b/start_finetune.sh new file mode 100755 index 00000000..c09d800b --- /dev/null +++ b/start_finetune.sh @@ -0,0 +1,5 @@ +#!/bin/bash +export TRAINER_TELEMETRY=0 +source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" +conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" +python finetune.py diff --git a/system/ft_tokenizer/compare_and_merge.py b/system/ft_tokenizer/compare_and_merge.py new file mode 100644 index 00000000..dd1c4534 --- /dev/null +++ b/system/ft_tokenizer/compare_and_merge.py @@ -0,0 +1,48 @@ +import json +import os + +# Define file paths +base_path = "/alltalk_tts/system/ft_tokenizer/" +original_file = os.path.join(base_path, "/alltalk_tts/models/xtts/xttsv2_2.0.3/vocab.json") # base model vocab.json +new_file = os.path.join(base_path, "/expanded_model/bpe_tokenizer-vocab.json") # new bpe tokenizer vocab.json +output_file = os.path.join(base_path, "/expanded_model/expanded_vocab.json") # output expanded_vocab.json file/path + + +with open(original_file, 'r') as f: + original_data = json.load(f) + +with open(new_file, 'r') as f: + new_data = json.load(f) + +# Get the original and new vocabularies +original_vocab = original_data['model']['vocab'] +new_vocab = new_data['model']['vocab'] + +# Find the maximum value in the original vocabulary. We do this so we make sure to append and not overwrite any new values +max_value = max(original_vocab.values()) + +# merge +for key, value in new_vocab.items(): + if key not in original_vocab: + max_value += 1 + original_vocab[key] = max_value + +# Update +original_data['model']['vocab'] = original_vocab + +# Get the original and new merges +original_merges = original_data['model']['merges'] +new_merges = new_data['model']['merges'] + +# Merge the merges +merged_merges = original_merges.copy() +for merge in new_merges: + if merge not in merged_merges: + merged_merges.append(merge) + +# Update +original_data['model']['merges'] = merged_merges +with open(output_file, 'w') as f: + json.dump(original_data, f, ensure_ascii=False, indent=2) + +print(f"Merged vocabulary and merges saved to {output_file}") diff --git a/system/ft_tokenizer/expand_xtts.py b/system/ft_tokenizer/expand_xtts.py new file mode 100644 index 00000000..2c79f0c0 --- /dev/null +++ b/system/ft_tokenizer/expand_xtts.py @@ -0,0 +1,126 @@ +""" +This script does not have any specific alltalk integration yet. So we are manually entering paths and it works as a standalone. + +The script does the following: + - Expands the embedding layer of the base XTTSv2 model according to the user created/trained bpe_tokenizer-vocab.json and the base model vocab.json. 
+Set variable paths with the base config and base model.pth. +Set the new tokenizer/vocab bpe_tokenizer-vocab.json location. +The new model will be saved at \expanded_models\expanded_model.pth + +Once this is done the new expanded model must be swapped in for the base model.pth then combined with the dvae.pth, vocab.json(bpe_tokenizer-vocab.json), base model config.json, +base model speaker_xtts.pth and base model vocoder.json. + + +I left some print debug statements in the script, they may be nice for the user to see during the process. + +""" + +import torch +import torch.nn as nn +import json +from TTS.tts.models.xtts import Xtts +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig +from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer + +config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json" # Path to the base model config.json +pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth" # Path to the base model.pth +new_tokenizer_path = "/expanded_model/expanded_vocab.json" # Path to the new combined expanded_vocab.json +expanded_model_path = "/expanded_model/expanded_model.pth" # Path to where you want the new expanded_model.pth + + +# Open and load the configuration file +with open(config_path, "r") as f: + config_dict = json.load(f) + +# Create a GPTTrainerConfig object and populate it with the loaded configuration +config = GPTTrainerConfig() +config.from_dict(data=config_dict) + +# Function to get the vocabulary size from a tokenizer file +def get_vocab_size(tokenizer_path): + tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path) + return len(tokenizer.tokenizer.get_vocab()) + +# Function to adjust the pretrained model with a new tokenizer +def adjust_pretrained_model( + pretrained_model_path, adjusted_model_path, new_tokenizer_path +): + # Load the pretrained model state dictionary + state_dict = torch.load(pretrained_model_path) + pretrained_state_dict = state_dict["model"] + + # Create a new Xtts model instance with the loaded configuration + model = Xtts(config) + + # Load the pretrained state dictionary into the new model + missing_keys, unexpected_keys = model.load_state_dict( + pretrained_state_dict, strict=False + ) + # Print any missing or unexpected keys for debugging + if missing_keys: + print(f"Missing keys: {missing_keys}") + if unexpected_keys: + print(f"Unexpected keys: {unexpected_keys}") + print("Pretrained model loaded successfully.") + + # Create a new tokenizer with the new vocabulary + new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path) + + # Get the old and new vocabulary sizes, and the embedding dimension + old_vocab_size = model.gpt.text_embedding.num_embeddings + new_vocab_size = len(new_tokenizer.tokenizer.get_vocab()) + embedding_dim = model.gpt.text_embedding.embedding_dim + + # Print vocabulary sizes and embedding dimension for debugging + print(f"Old vocab size: {old_vocab_size}") + print(f"New vocab size: {new_vocab_size}") + print(f"Embedding dimension: {embedding_dim}") + + # Adjust the embedding layer with the new vocabulary size + adjust_embedding_layer( + model, pretrained_state_dict, new_vocab_size, adjusted_model_path + ) + +# Function to adjust the embedding layer for the new vocabulary size +def adjust_embedding_layer( + model, pretrained_state_dict, new_vocab_size, adjusted_model_path +): + old_vocab_size = model.gpt.text_embedding.num_embeddings + embedding_dim = model.gpt.text_embedding.embedding_dim + + # Create new embedding and linear layers with the new vocabulary size + 
new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim) + new_text_head = nn.Linear(embedding_dim, new_vocab_size) + + # Copy weights from the old embedding layer to the new one + if new_vocab_size > old_vocab_size: + # If the new vocabulary is larger, copy existing weights and initialize new ones + new_text_embedding.weight.data[:old_vocab_size] = ( + model.gpt.text_embedding.weight.data + ) + new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data + new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data + + # Initialize new weights with normal distribution + new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02) + else: + # If the new vocabulary is smaller, truncate the existing weights + new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[ + :new_vocab_size + ] + new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size] + new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size] + + # Replace the old embedding layer with the new one + model.gpt.text_embedding = new_text_embedding + model.gpt.text_head = new_text_head + + # Save the adjusted model + checkpoint = {"model": model.state_dict()} + torch.save(checkpoint, adjusted_model_path) + print(f"Adjusted model saved to {adjusted_model_path}") + +# Expand the pretrained model with the new tokenizer +adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path) From 7e2d3455d2234c6807afdf627124de8cfbf6ee2e Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 1 Aug 2024 00:05:42 -0700 Subject: [PATCH 2/5] removed unecessary scripts and finetune.py edit --- finetune.py | 26 -------------------------- start_alltalk.sh | 4 ---- start_diagnostics.sh | 4 ---- start_environment.sh | 16 ---------------- start_finetune.sh | 5 ----- 5 files changed, 55 deletions(-) delete mode 100755 start_alltalk.sh delete mode 100755 start_diagnostics.sh delete mode 100755 start_environment.sh delete mode 100755 start_finetune.sh diff --git a/finetune.py b/finetune.py index 073bf71f..ea7286ed 100644 --- a/finetune.py +++ b/finetune.py @@ -2375,29 +2375,3 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n outputs=[final_progress_data], ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - -import random -def find_available_port(start_port, end_port): - ports = list(range(start_port, end_port + 1)) - random.shuffle(ports) - return ports - -ports_to_try = find_available_port(7800, 7810) - -for port in ports_to_try: - try: - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=port, - server_name="127.0.0.1", - ) - print(f"Successfully launched on port {port}") - break - except OSError: - print(f"Port {port} is not available, trying next...") -else: - print("No available ports in the specified range.") - diff --git a/start_alltalk.sh b/start_alltalk.sh deleted file mode 100755 index 55e0756c..00000000 --- a/start_alltalk.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python script.py diff --git a/start_diagnostics.sh b/start_diagnostics.sh deleted file mode 100755 index c5586879..00000000 --- 
a/start_diagnostics.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python diagnostics.py diff --git a/start_environment.sh b/start_environment.sh deleted file mode 100755 index 9791ff36..00000000 --- a/start_environment.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -cd "." -if [[ "/home/eleven/alltalkbeta/alltalk_tts" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi -# deactivate existing conda envs as needed to avoid conflicts -{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null -# config -CONDA_ROOT_PREFIX="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda" -INSTALL_ENV_DIR="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME -export CUDA_PATH="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -export CUDA_HOME="/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -# activate env -bash --init-file <(echo "source \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh\" && conda activate \"/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env\"") diff --git a/start_finetune.sh b/start_finetune.sh deleted file mode 100755 index c09d800b..00000000 --- a/start_finetune.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -export TRAINER_TELEMETRY=0 -source "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/conda/etc/profile.d/conda.sh" -conda activate "/home/eleven/alltalkbeta/alltalk_tts/alltalk_environment/env" -python finetune.py From 9047dcb13986d2fbaf052b082f0be5f211ad8523 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 1 Aug 2024 00:07:46 -0700 Subject: [PATCH 3/5] removed unecessary scripts and finetune.py edit --- finetune.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/finetune.py b/finetune.py index ea7286ed..b622baff 100644 --- a/finetune.py +++ b/finetune.py @@ -2375,3 +2375,12 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n outputs=[final_progress_data], ) model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) + + demo.queue().launch( + show_api=False, + inbrowser=True, + share=False, + debug=False, + server_port=7052, + server_name="127.0.0.1", + ) From 44bd45a6241ba07199088a63b823faf239125e12 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Fri, 2 Aug 2024 21:39:13 -0700 Subject: [PATCH 4/5] Fixed merge of vocab.json and new tokenizer for custom dataset Fixed vocab.json merge, new tokenizer for custom dataset, dataset cleaner. 
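For reference, the intended order of operations with these files is: extract plain transcripts from the metadata CSVs with extract_dataset_for_tokenizer.py, train the dataset-specific BPE tokenizer with custom_tokenizer.py, then merge its vocab and merges into the base model's vocab.json with compare_and_merge.py. A minimal sketch of that flow, assuming the two helper scripts are imported as modules (their module-level example calls would first need an `if __name__ == "__main__":` guard) and that every path shown is illustrative only:

    from custom_tokenizer import train_tokenizer
    from compare_and_merge import merge_vocabularies

    # 1. dataset.txt is the transcript-only text produced by
    #    extract_dataset_for_tokenizer.py from the combined metadata CSVs.
    # 2. Train the dataset-specific BPE tokenizer (the model is stuck at a
    #    vocab size of 256, so that cap is kept here).
    train_tokenizer(
        input_path="dataset.txt",
        tokenizer_path="expanded_models/bpe_tokenizer-vocab.json",
        language="multi",
        special_tokens=["[STOP]", "[UNK]", "[SPACE]"],
        vocab_size=256,
    )

    # 3. Append the new tokens and merges to the base model's vocab.json
    #    without renumbering any of its existing entries.
    merge_vocabularies(
        base_vocab_path="models/xtts/xttsv2_2.0.2/vocab.json",
        new_vocab_path="expanded_models/bpe_tokenizer-vocab.json",
        output_path="expanded_models/combined_vocab.json",
    )

The combined vocab.json is then what expand_xtts.py consumes when resizing the embedding layers.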
--- finetune.py | 14 +--- system/ft_tokenizer/compare_and_merge.py | 74 ++++++++++--------- system/ft_tokenizer/custom_tokenizer.py | 60 +++++++++++++++ .../extract_dataset_for_tokenizer.py | 26 +++++++ 4 files changed, 127 insertions(+), 47 deletions(-) create mode 100644 system/ft_tokenizer/custom_tokenizer.py create mode 100644 system/ft_tokenizer/extract_dataset_for_tokenizer.py diff --git a/finetune.py b/finetune.py index b622baff..96f1b8bf 100644 --- a/finetune.py +++ b/finetune.py @@ -918,7 +918,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, print(f"[FINETUNE] Learning Scheduler {lr_scheduler}, params {lr_scheduler_params}") - # training parameters config config = GPTTrainerConfig( epochs=num_epochs, @@ -951,8 +950,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, lr_scheduler=lr_scheduler, # it was adjusted accordly for the new step scheme lr_scheduler_params=lr_scheduler_params, - test_sentences=[], - ) progress(0, desc="Model is currently training. See console for more information") # init the model from config model = GPTTrainer.init_from_config(config) @@ -2374,13 +2371,4 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n fn=delete_voice_sample_contents, outputs=[final_progress_data], ) - model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) - - demo.queue().launch( - show_api=False, - inbrowser=True, - share=False, - debug=False, - server_port=7052, - server_name="127.0.0.1", - ) + model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None) \ No newline at end of file diff --git a/system/ft_tokenizer/compare_and_merge.py b/system/ft_tokenizer/compare_and_merge.py index dd1c4534..58d5b3a3 100644 --- a/system/ft_tokenizer/compare_and_merge.py +++ b/system/ft_tokenizer/compare_and_merge.py @@ -1,48 +1,54 @@ import json import os -# Define file paths -base_path = "/alltalk_tts/system/ft_tokenizer/" -original_file = os.path.join(base_path, "/alltalk_tts/models/xtts/xttsv2_2.0.3/vocab.json") # base model vocab.json -new_file = os.path.join(base_path, "/expanded_model/bpe_tokenizer-vocab.json") # new bpe tokenizer vocab.json -output_file = os.path.join(base_path, "/expanded_model/expanded_vocab.json") # output expanded_vocab.json file/path +def merge_vocabularies(base_vocab_path, new_vocab_path, output_path): + # Load the base model's vocab.json + with open(base_vocab_path, 'r') as f: + base_data = json.load(f) + + # Load the new bpe_tokenizer.json + with open(new_vocab_path, 'r') as f: + new_data = json.load(f) + # Extract the vocabularies + base_vocab = base_data['model']['vocab'] + new_vocab = new_data['model']['vocab'] -with open(original_file, 'r') as f: - original_data = json.load(f) + # Find the maximum value in the base vocabulary + max_value = max(base_vocab.values()) -with open(new_file, 'r') as f: - new_data = json.load(f) + # Merge the vocabularies + for key, value in new_vocab.items(): + if key not in base_vocab: + max_value += 1 + base_vocab[key] = max_value -# Get the original and new vocabularies -original_vocab = original_data['model']['vocab'] -new_vocab = new_data['model']['vocab'] + # Update the base data with the merged vocabulary + base_data['model']['vocab'] = base_vocab -# Find the maximum value in the original vocabulary. 
We do this so we make sure to append and not overwrite any new values -max_value = max(original_vocab.values()) + # Extract the merges + base_merges = base_data['model']['merges'] + new_merges = new_data['model']['merges'] -# merge -for key, value in new_vocab.items(): - if key not in original_vocab: - max_value += 1 - original_vocab[key] = max_value + # Merge the merges + merged_merges = base_merges.copy() + for merge in new_merges: + if merge not in merged_merges: + merged_merges.append(merge) -# Update -original_data['model']['vocab'] = original_vocab + # Update the base data with the merged merges + base_data['model']['merges'] = merged_merges -# Get the original and new merges -original_merges = original_data['model']['merges'] -new_merges = new_data['model']['merges'] + # Write the merged vocabulary and merges to the output file + with open(output_path, 'w') as f: + json.dump(base_data, f, ensure_ascii=False, indent=2) -# Merge the merges -merged_merges = original_merges.copy() -for merge in new_merges: - if merge not in merged_merges: - merged_merges.append(merge) + print(f"Merged vocabulary and merges saved to {output_path}") -# Update -original_data['model']['merges'] = merged_merges -with open(output_file, 'w') as f: - json.dump(original_data, f, ensure_ascii=False, indent=2) +# Define file paths +base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json" # base model vocab.json path (2.0.2) +new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # path to the custom dataset vocab.json +output_path = "/alltalk_tts/expanded_models/combined_vocab.json" # location for combined vocab.json -print(f"Merged vocabulary and merges saved to {output_file}") +# Merge the vocabularies +merge_vocabularies(base_vocab_path, new_vocab_path, output_path) \ No newline at end of file diff --git a/system/ft_tokenizer/custom_tokenizer.py b/system/ft_tokenizer/custom_tokenizer.py new file mode 100644 index 00000000..43624252 --- /dev/null +++ b/system/ft_tokenizer/custom_tokenizer.py @@ -0,0 +1,60 @@ +# Credit Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py +# provide a cleaned txt file with only the transcription. 
Youll get back out the datasets vocab.json + +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer +from tokenizers.pre_tokenizers import Whitespace +from tkinter import Tk, filedialog +import json +import re + +def clean_text(input_file_path, output_file_path): + # Define the pattern to match numbers, specific symbols, and new lines + # add \d to match any digit, and | is used to specify alternatives + pattern = r'|�|«|\$|\n' + + with open(input_file_path, 'r', encoding='utf-8') as input_file: + text = input_file.read() + cleaned_text = re.sub(pattern, '', text) + + with open(output_file_path, 'w', encoding='utf-8') as output_file: + output_file.write(cleaned_text) + +def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]" ], vocab_size=256): + # Initialize a tokenizer with the BPE model + tokenizer = Tokenizer(BPE(unk_token="[UNK]")) + # Use a basic whitespace pre-tokenizer + tokenizer.pre_tokenizer = Whitespace() + + # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256) + trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size) + + + clean_text(input_path, input_path) + tokenizer.train([input_path], trainer) + + tokenizer.save(tokenizer_path) + + with open(tokenizer_path, 'r', encoding='utf-8') as f: + tokenizer_json = json.load(f) + + # Add language to tokenizer + tokenizer_json['model']['language'] = language + + with open(tokenizer_path, 'w', encoding='utf-8') as f: + json.dump(tokenizer_json, f, ensure_ascii=False, indent=4) + +def choose_file(): + root = Tk() + root.withdraw() + file = filedialog.askopenfilename() + root.destroy() + return file + +if __name__ == "__main__": + input_path = choose_file() + tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # define path to put the newly create vocab.json + special_tokens = ["[STOP]", "[UNK]", "[SPACE]"] + vocab_size = 256 # model is stuck at this size + train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size) \ No newline at end of file diff --git a/system/ft_tokenizer/extract_dataset_for_tokenizer.py b/system/ft_tokenizer/extract_dataset_for_tokenizer.py new file mode 100644 index 00000000..36dee0c5 --- /dev/null +++ b/system/ft_tokenizer/extract_dataset_for_tokenizer.py @@ -0,0 +1,26 @@ +# Simple script to remove LJ Speech formatting for the tokenizer +# Combine metadata_train and metadata_eval.csv into a single file then run +import csv + +# Input and output file names +input_file = '/alltalkbeta/metadata_eval.csv' # combine metadata_train and metadata_eval.csv +output_file = '/alltalkbeta/dataset.txt' # this goes to the tokenizer + +# Read the input CSV and write to the output file +with open(input_file, 'r', newline='', encoding='utf-8') as infile, \ + open(output_file, 'w', newline='', encoding='utf-8') as outfile: + + # Create CSV reader and writer objects + reader = csv.reader(infile, delimiter='|') + writer = csv.writer(outfile, delimiter='|') + + # Skip the header + next(reader, None) + + # Process each row + for row in reader: + if len(row) >= 2: + # Write only the second column (index 1) to the output file + writer.writerow([row[1]]) + +print(f"Processing complete. 
Output written to {output_file}") \ No newline at end of file From aafae8da82803857f3f0b6e289d22d552eeae2d1 Mon Sep 17 00:00:00 2001 From: IIEleven11 Date: Thu, 19 Sep 2024 21:11:31 -0700 Subject: [PATCH 5/5] Fixed slurred speech by freezing all layers but the two embedding layers --- system/ft_tokenizer/expand_xtts.py | 53 ++++++++++++++---------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/system/ft_tokenizer/expand_xtts.py b/system/ft_tokenizer/expand_xtts.py index 2c79f0c0..832fa7ba 100644 --- a/system/ft_tokenizer/expand_xtts.py +++ b/system/ft_tokenizer/expand_xtts.py @@ -43,20 +43,13 @@ def get_vocab_size(tokenizer_path): # Function to adjust the pretrained model with a new tokenizer def adjust_pretrained_model( - pretrained_model_path, adjusted_model_path, new_tokenizer_path -): - # Load the pretrained model state dictionary + pretrained_model_path, adjusted_model_path, new_tokenizer_path): state_dict = torch.load(pretrained_model_path) pretrained_state_dict = state_dict["model"] - - # Create a new Xtts model instance with the loaded configuration model = Xtts(config) # Load the pretrained state dictionary into the new model - missing_keys, unexpected_keys = model.load_state_dict( - pretrained_state_dict, strict=False - ) - # Print any missing or unexpected keys for debugging + missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False) if missing_keys: print(f"Missing keys: {missing_keys}") if unexpected_keys: @@ -71,20 +64,18 @@ def adjust_pretrained_model( new_vocab_size = len(new_tokenizer.tokenizer.get_vocab()) embedding_dim = model.gpt.text_embedding.embedding_dim - # Print vocabulary sizes and embedding dimension for debugging print(f"Old vocab size: {old_vocab_size}") print(f"New vocab size: {new_vocab_size}") print(f"Embedding dimension: {embedding_dim}") # Adjust the embedding layer with the new vocabulary size - adjust_embedding_layer( - model, pretrained_state_dict, new_vocab_size, adjusted_model_path - ) - -# Function to adjust the embedding layer for the new vocabulary size -def adjust_embedding_layer( - model, pretrained_state_dict, new_vocab_size, adjusted_model_path -): + adjust_embedding_layer(model, new_vocab_size, adjusted_model_path) + + # Freeze all parameters except the position embeddings + freeze_except_position_embeddings(model) + + # Function to adjust the embedding layer for the new vocabulary size +def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path): old_vocab_size = model.gpt.text_embedding.num_embeddings embedding_dim = model.gpt.text_embedding.embedding_dim @@ -94,33 +85,37 @@ def adjust_embedding_layer( # Copy weights from the old embedding layer to the new one if new_vocab_size > old_vocab_size: - # If the new vocabulary is larger, copy existing weights and initialize new ones - new_text_embedding.weight.data[:old_vocab_size] = ( - model.gpt.text_embedding.weight.data - ) + new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data - # Initialize new weights with normal distribution new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02) new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02) else: - # If the new vocabulary is smaller, truncate the existing weights - new_text_embedding.weight.data 
= model.gpt.text_embedding.weight.data[ - :new_vocab_size - ] + new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size] new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size] new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size] - # Replace the old embedding layer with the new one model.gpt.text_embedding = new_text_embedding model.gpt.text_head = new_text_head - # Save the adjusted model checkpoint = {"model": model.state_dict()} torch.save(checkpoint, adjusted_model_path) print(f"Adjusted model saved to {adjusted_model_path}") +# Function to freeze all parameters except the position embeddings +def freeze_except_position_embeddings(model): + for param in model.parameters(): + param.requires_grad = False + + for name, param in model.named_parameters(): + if 'pos_embedding' in name: + param.requires_grad = True + + # Verify which parameters are frozen and which are not, comment this out if you dont want to debug. You should see only two true values + for name, param in model.named_parameters(): + print(f"{name}: requires_grad={param.requires_grad}") + # Expand the pretrained model with the new tokenizer adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
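Before launching a fine-tuning run on the expanded checkpoint, it is worth confirming that the freeze behaved as intended and that only the position-embedding parameters are still trainable. A minimal sketch, assuming `model` is the Xtts instance after adjust_pretrained_model() has run (which now also calls freeze_except_position_embeddings()); the helper name summarize_trainable is illustrative and not part of this patch, and it condenses the per-parameter requires_grad printout already in the script:

    # Print only the parameters that will actually receive gradients plus the
    # total count of frozen weights; with the freeze above, the trainable list
    # should contain just the GPT position-embedding tensors.
    def summarize_trainable(model):
        frozen = 0
        trainable = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                trainable.append((name, param.numel()))
            else:
                frozen += param.numel()
        for name, numel in trainable:
            print(f"trainable: {name} ({numel} weights)")
        print(f"frozen weights: {frozen}")
        return trainable

    summarize_trainable(model)

Note that requires_grad flags are not persisted in the saved state dict, so the same freeze has to be re-applied after loading expanded_model.pth for training.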