idiap · eginhard · Oct 4, 2024 · May 30, 2024 · May 30, 2024 · May 30, 2024
diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
@@ -8,6 +8,7 @@ defaults:
  bash
 jobs:
  build-sdist:
+ name: Build source distribution
  runs-on: ubuntu-latest
  steps:
  - uses: actions/checkout@v4
@@ -23,37 +24,31 @@ jobs:
  with:
  python-version: 3.9
  - run: |
- python -m pip install -U pip setuptools wheel build
+ python -m pip install -U pip setuptools build
  - run: |
  python -m build
  - run: |
  pip install dist/*.tar.gz
  - uses: actions/upload-artifact@v4
  with:
- name: sdist
+ name: build-sdist
  path: dist/*.tar.gz
  build-wheels:
- runs-on: ubuntu-latest
+ name: Build wheels on ${{ matrix.os }}
+ runs-on: ${{ matrix.os }}
  strategy:
  matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12"]
+ os: [ubuntu-latest, windows-latest, macos-latest]
  steps:
  - uses: actions/checkout@v4
- - uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install build requirements
- run: |
- python -m pip install -U pip setuptools wheel build numpy cython
- - name: Setup and install manylinux1_x86_64 wheel
- run: |
- python setup.py bdist_wheel --plat-name=manylinux1_x86_64
- python -m pip install dist/*-manylinux*.whl
+ - name: Build wheels
+ uses: pypa/cibuildwheel@v2.21.1
  - uses: actions/upload-artifact@v4
  with:
- name: wheel-${{ matrix.python-version }}
- path: dist/*-manylinux*.whl
+ name: build-wheels-${{ matrix.os }}
+ path: ./wheelhouse/*.whl
  publish-artifacts:
+ name: Publish to PyPI
  runs-on: ubuntu-latest
  needs: [build-sdist, build-wheels]
  environment:
@@ -62,28 +57,11 @@ jobs:
  permissions:
  id-token: write
  steps:
- - run: |
- mkdir dist
- - uses: actions/download-artifact@v4
- with:
- name: "sdist"
- path: "dist/"
- - uses: actions/download-artifact@v4
- with:
- name: "wheel-3.9"
- path: "dist/"
- - uses: actions/download-artifact@v4
- with:
- name: "wheel-3.10"
- path: "dist/"
- - uses: actions/download-artifact@v4
- with:
- name: "wheel-3.11"
- path: "dist/"
  - uses: actions/download-artifact@v4
  with:
- name: "wheel-3.12"
- path: "dist/"
+ path: dist
+ pattern: build-*
+ merge-multiple: true
  - run: |
  ls -lh dist/
  - name: Publish package distributions to PyPI

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -45,13 +45,17 @@ jobs:
  sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
  - name: Install TTS
  run: |
- python3 -m uv pip install --system "coqui-tts[dev,server,languages] @ ."
- python3 setup.py egg_info
+ resolution=highest
+ if [ "${{ matrix.python-version }}" == "3.9" ]; then
+ resolution=lowest-direct
+ fi
+ python3 -m uv pip install --resolution=$resolution --system "coqui-tts[dev,server,languages] @ ."
  - name: Unit tests
  run: make ${{ matrix.subset }}
  - name: Upload coverage data
  uses: actions/upload-artifact@v4
  with:
+ include-hidden-files: true
  name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
  path: .coverage.*
  if-no-files-found: ignore

diff --git a/README.md b/README.md
@@ -4,10 +4,10 @@
 - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
 - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech).
 - 📣 ⓍTTS can now stream with <200ms latency.
-- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/dev/models/xtts.html)
-- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/dev/models/bark.html)
+- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html)
+- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
-- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/dev/models/tortoise.html)
+- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/tortoise.html)
 
 <div align="center">
 <img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
@@ -55,6 +55,10 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 [discord]: https://discord.gg/5eXr5seRrv
 [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
 
+The [issues](https://github.com/coqui-ai/TTS/issues) and
+[discussions](https://github.com/coqui-ai/TTS/discussions) in the original
+repository also remain a useful source of information.
+
 
 ## 🔗 Links and Resources
 | Type | Links |
@@ -143,6 +147,7 @@ If you plan to code or train models, clone 🐸TTS and install it locally.
 
 ```bash
 git clone https://github.com/idiap/coqui-ai-TTS
+cd coqui-ai-TTS
 pip install -e .
 ```
 

diff --git a/TTS/.models.json b/TTS/.models.json
@@ -48,7 +48,6 @@
  "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
  "https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
  "https://coqui.gateway.scarf.sh/hf/bark/config.json",
- "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
  "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
  ],
  "default_vocoder": null,

diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py
@@ -8,14 +8,14 @@
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from trainer.io import load_checkpoint
 
 from TTS.config import load_config
 from TTS.tts.datasets.TTSDataset import TTSDataset
 from TTS.tts.models import setup_model
 from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
-from TTS.utils.io import load_checkpoint
 
 if __name__ == "__main__":
  setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
@@ -35,7 +35,7 @@
  --data_path /root/LJSpeech-1.1/
  --batch_size 32
  --dataset ljspeech
- --use_cuda True
+ --use_cuda
 """,
  formatter_class=RawTextHelpFormatter,
  )
@@ -62,7 +62,7 @@
  help="Dataset metafile inclusing file paths with transcripts.",
  )
  parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
- parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")
 
  parser.add_argument(
  "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
@@ -150,7 +150,7 @@ def compute_embeddings(
  default=False,
  action="store_true",
  )
- parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+ parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
  parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
  parser.add_argument(
  "--formatter_name",

diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py
@@ -75,8 +75,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
  type=str,
  help="Path to dataset config file.",
  )
- parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
 
  args = parser.parse_args()
 

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
@@ -282,7 +282,7 @@ def main(args): # pylint: disable=redefined-outer-name
  parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
  parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
  parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
  args = parser.parse_args()
 
  c = load_config(args.config_path)

diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
@@ -80,7 +80,7 @@ def preprocess_audios():
  setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
 
  parser = argparse.ArgumentParser(
- description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
  )
  parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
  parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -95,20 +95,20 @@ def preprocess_audios():
  parser.add_argument(
  "-t",
  "--trim_just_beginning_and_end",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=True,
- help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
  )
  parser.add_argument(
  "-c",
  "--use_cuda",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use cuda",
  )
  parser.add_argument(
  "--use_onnx",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use onnx",
  )

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
+
+"""Command line interface."""
 
 import argparse
 import contextlib
@@ -136,30 +137,16 @@
 """
 
 
-def str2bool(v):
- if isinstance(v, bool):
- return v
- if v.lower() in ("yes", "true", "t", "y", "1"):
- return True
- if v.lower() in ("no", "false", "f", "n", "0"):
- return False
- raise argparse.ArgumentTypeError("Boolean value expected.")
-
-
-def main():
- setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
-
+def parse_args() -> argparse.Namespace:
+ """Parse arguments."""
  parser = argparse.ArgumentParser(
  description=description.replace(" ```\n", ""),
  formatter_class=RawTextHelpFormatter,
  )
 
  parser.add_argument(
  "--list_models",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  help="list available pre-trained TTS and vocoder models.",
  )
 
@@ -207,7 +194,7 @@ def main():
  default="tts_output.wav",
  help="Output wav file path.",
  )
- parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+ parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
  parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
  parser.add_argument(
  "--vocoder_path",
@@ -226,10 +213,7 @@ def main():
  parser.add_argument(
  "--pipe_out",
  help="stdout the generated TTS wav file for shell pipe.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
 
  # args for multi-speaker synthesis
@@ -261,25 +245,18 @@ def main():
  parser.add_argument(
  "--list_speaker_idxs",
  help="List available speaker ids for the defined multi-speaker model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  parser.add_argument(
  "--list_language_idxs",
  help="List available language ids for the defined multi-lingual model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  # aux args
  parser.add_argument(
  "--save_spectogram",
- type=bool,
- help="If true save raw spectogram for further (vocoder) processing in out_path.",
- default=False,
+ action="store_true",
+ help="Save raw spectogram for further (vocoder) processing in out_path.",
  )
  parser.add_argument(
  "--reference_wav",
@@ -295,8 +272,8 @@ def main():
  )
  parser.add_argument(
  "--progress_bar",
- type=str2bool,
- help="If true shows a progress bar for the model download. Defaults to True",
+ action=argparse.BooleanOptionalAction,
+ help="Show a progress bar for the model download.",
  default=True,
  )
 
@@ -337,19 +314,23 @@ def main():
  ]
  if not any(check_args):
  parser.parse_args(["-h"])
+ return args
+
+
+def main():
+ setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+ args = parse_args()
 
  pipe_out = sys.stdout if args.pipe_out else None
 
  with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
  # Late-import to make things load faster
- from TTS.api import TTS
  from TTS.utils.manage import ModelManager
  from TTS.utils.synthesizer import Synthesizer
 
  # load model manager
  path = Path(__file__).parent / "../.models.json"
  manager = ModelManager(path, progress_bar=args.progress_bar)
- api = TTS()
 
  tts_path = None
  tts_config_path = None