feat: add ctc segmentation tool to everyvoice

EveryVoiceTTS · Nov 11, 2023 · e828331
1 parent ef0fc9a
commit e828331
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 4 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning"]
 	path = everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
 	url = ../HiFiGAN_iSTFT_lightning.git
+[submodule "everyvoice/model/aligner/wav2vec2aligner"]
+	path = everyvoice/model/aligner/wav2vec2aligner
+	url = ../wav2vec2aligner.git
diff --git a/everyvoice/__init__.py b/everyvoice/__init__.py
@@ -11,9 +11,12 @@
 sys.path.append(
     os.path.join(parent_folder_path, "model", "aligner", "DeepForcedAligner")
 )
+sys.path.append(os.path.join(parent_folder_path, "model", "aligner", "wav2vec2aligner"))
 sys.path.append(
-    os.path.join(parent_folder_path, "model", "aligner", "FastSpeech2_lightning")
+    os.path.join(
+        parent_folder_path, "model", "feature_prediction", "FastSpeech2_lightning"
+    )
 )
 sys.path.append(
-    os.path.join(parent_folder_path, "model", "aligner", "HiFiGAN_iSTFT_lightning")
+    os.path.join(parent_folder_path, "model", "vocoder", "HiFiGAN_iSTFT_lightning")
 )
diff --git a/everyvoice/cli.py b/everyvoice/cli.py
@@ -10,6 +10,9 @@
 from everyvoice.config.preprocessing_config import PreprocessingConfig
 from everyvoice.config.text_config import TextConfig
 from everyvoice.model.aligner.config import AlignerConfig
+from everyvoice.model.aligner.wav2vec2aligner.aligner.cli import (
+    align_single as ctc_segment,
+)
 from everyvoice.model.e2e.config import EveryVoiceConfig
 from everyvoice.model.feature_prediction.config import FeaturePredictionConfig
 from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.cli import (
@@ -56,6 +59,11 @@ def list_commands(self, ctx):
 
     To run the new dataset wizard please use the following command: everyvoice new-dataset
 
+    ## Segment long files in your dataset
+
+    If you have long audio files that contain more than one utterance,
+    you can use the segmentation tool by running everyvoice segment [OPTIONS]
+
     ## Preprocess
 
     Once you have a configuration, preprocess your data by running everyvoice preprocess [OPTIONS]
@@ -81,6 +89,18 @@ class ModelTypes(str, Enum):
     spec_to_wav = "spec-to-wav"
 
 
+app.command(
+    short_help="Segment a long audio file",
+    name="segment",
+    help="""
+    # Segmentation help
+
+    This command will segment a long audio file into multiple utterances which is required for training a TTS system.
+    This command should work on most languages and you should run it before running the new dataset or preprocessing steps.
+    """,
+)(ctc_segment)
+
+
 @app.command(
     short_help="This command will help you create all the configuration necessary for using a new dataset.",
     help="""

diff --git a/everyvoice/model/aligner/wav2vec2aligner b/everyvoice/model/aligner/wav2vec2aligner
diff --git a/requirements.torch.txt b/requirements.torch.txt
@@ -1,4 +1,4 @@
 # these requirements have to be installed ahead of time in your environment and from a different URL:
 # CUDA_TAG=cu117 pip install -r requirements.torch.txt --find-links https://download.pytorch.org/whl/torch_stable.html
-torch==2.0.1+${CUDA_TAG}
-torchaudio==2.0.2+${CUDA_TAG}
+torch==2.1.0+${CUDA_TAG}
+torchaudio==2.1.0+${CUDA_TAG}