Skip to content

Commit

Permalink
Merge pull request #3251 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.20.6
  • Loading branch information
erogol authored Nov 21, 2023
2 parents 6471273 + 29dede2 commit 2211ba2
Show file tree
Hide file tree
Showing 19 changed files with 419 additions and 506 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pypi-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
build-sdist:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Verify tag matches version
run: |
set -ex
Expand Down Expand Up @@ -38,7 +38,7 @@ jobs:
matrix:
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
Expand Down
4 changes: 2 additions & 2 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2 by Coqui with 16 languages.",
"description": "XTTS-v2.0.2 by Coqui with 16 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
"model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.20.5
0.20.6
11 changes: 6 additions & 5 deletions TTS/bin/extract_tts_spectrograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import quantize
from TTS.utils.generic_utils import count_parameters

use_cuda = torch.cuda.is_available()
Expand Down Expand Up @@ -159,7 +160,7 @@ def inference(


def extract_spectrograms(
data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
model.eval()
export_metadata = []
Expand Down Expand Up @@ -196,8 +197,8 @@ def extract_spectrograms(
_, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

# quantize and save wav
if quantized_wav:
wavq = ap.quantize(wav)
if quantize_bits > 0:
wavq = quantize(wav, quantize_bits)
np.save(wavq_path, wavq)

# save TTS mel
Expand Down Expand Up @@ -263,7 +264,7 @@ def main(args): # pylint: disable=redefined-outer-name
model,
ap,
args.output_path,
quantized_wav=args.quantized,
quantize_bits=args.quantize_bits,
save_audio=args.save_audio,
debug=args.debug,
metada_name="metada.txt",
Expand All @@ -277,7 +278,7 @@ def main(args): # pylint: disable=redefined-outer-name
parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args()

Expand Down
12 changes: 10 additions & 2 deletions TTS/tts/layers/tortoise/diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@
import numpy as np
import torch
import torch as th
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
from tqdm import tqdm

from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
try:
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
except ImportError:
K_DIFFUSION_SAMPLERS = None


SAMPLERS = ["dpm++2m", "p", "ddim"]


Expand Down Expand Up @@ -531,6 +537,8 @@ def sample_loop(self, *args, **kwargs):
if self.conditioning_free is not True:
raise RuntimeError("cond_free must be true")
with tqdm(total=self.num_timesteps) as pbar:
if K_DIFFUSION_SAMPLERS is None:
raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
else:
raise RuntimeError("sampler not impl")
Expand Down
4 changes: 3 additions & 1 deletion TTS/tts/layers/xtts/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,9 @@ def forward(
audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)

# Pad mel codes with stop_audio_token
audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet
audio_codes = self.set_mel_padding(
audio_codes, code_lengths - 3
) # -3 to get the real code lengths without consider start and stop tokens that was not added yet

# Build input and target tensors
# Prepend start token to inputs and append stop token to targets
Expand Down
85 changes: 72 additions & 13 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,73 @@
import json
import os
import re
import textwrap
from functools import cached_property

import pypinyin
import torch
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese
from tokenizers import Tokenizer

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words


def get_spacy_lang(lang):
if lang == "zh":
return Chinese()
elif lang == "ja":
return Japanese()
elif lang == "ar":
return Arabic()
elif lang == "es":
return Spanish()
else:
# For most languages, Enlish does the job
return English()


def split_sentence(text, lang, text_split_length=250):
"""Preprocess the input text"""
text_splits = []
if text_split_length is not None and len(text) >= text_split_length:
text_splits.append("")
nlp = get_spacy_lang(lang)
nlp.add_pipe("sentencizer")
doc = nlp(text)
for sentence in doc.sents:
if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
# if the last sentence + the current sentence is less than the text_split_length
# then add the current sentence to the last sentence
text_splits[-1] += " " + str(sentence)
text_splits[-1] = text_splits[-1].lstrip()
elif len(str(sentence)) > text_split_length:
# if the current sentence is greater than the text_split_length
for line in textwrap.wrap(
str(sentence),
width=text_split_length,
drop_whitespace=True,
break_on_hyphens=False,
tabsize=1,
):
text_splits.append(str(line))
else:
text_splits.append(str(sentence))

if len(text_splits) > 1:
if text_splits[0] == "":
del text_splits[0]
else:
text_splits = [text.lstrip()]

return text_splits


_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
Expand Down Expand Up @@ -115,7 +171,7 @@
# There are not many common abbreviations in Arabic as in English.
]
],
"zh-cn": [
"zh": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
Expand Down Expand Up @@ -280,7 +336,7 @@ def expand_abbreviations_multilingual(text, lang="en"):
("°", " درجة "),
]
],
"zh-cn": [
"zh": [
# Chinese
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
Expand Down Expand Up @@ -464,7 +520,7 @@ def _expand_number(m, lang="en"):


def expand_numbers_multilingual(text, lang="en"):
if lang == "zh" or lang == "zh-cn":
if lang == "zh":
text = zh_num2words()(text)
else:
if lang in ["en", "ru"]:
Expand Down Expand Up @@ -525,7 +581,7 @@ def japanese_cleaners(text, katsu):
return text


def korean_cleaners(text):
def korean_transliterate(text):
r = Transliter(academic)
return r.translit(text)

Expand All @@ -546,7 +602,7 @@ def __init__(self, vocab_file=None):
"it": 213,
"pt": 203,
"pl": 224,
"zh-cn": 82,
"zh": 82,
"ar": 166,
"cs": 186,
"ru": 182,
Expand All @@ -564,28 +620,31 @@ def katsu(self):
return cutlet.Cutlet()

def check_input_length(self, txt, lang):
lang = lang.split("-")[0] # remove the region
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(
f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
)

def preprocess_text(self, txt, lang):
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh-cn", "zh-cn"}:
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
txt = multilingual_cleaners(txt, lang)
if lang in {"zh", "zh-cn"}:
if lang == "zh":
txt = chinese_transliterate(txt)
if lang == "ko":
txt = korean_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
elif lang == "ko":
txt = korean_cleaners(txt)
else:
raise NotImplementedError(f"Language '{lang}' is not supported.")
return txt

def encode(self, txt, lang):
lang = lang.split("-")[0] # remove the region
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
lang = "zh-cn" if lang == "zh" else lang
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
return self.tokenizer.encode(txt).ids
Expand Down Expand Up @@ -682,8 +741,8 @@ def test_expand_numbers_multilingual():
("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
# Chinese (Simplified)
("在12.5秒内", "在十二点五秒内", "zh-cn"),
("有50名士兵", "有五十名士兵", "zh-cn"),
("在12.5秒内", "在十二点五秒内", "zh"),
("有50名士兵", "有五十名士兵", "zh"),
# ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
# ("那将是20€先生", '那将是二十欧元先生', 'zh'),
# Turkish
Expand Down Expand Up @@ -764,7 +823,7 @@ def test_symbols_multilingual():
("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
("我的电量为 14%", "我的电量为 14 百分之", "zh-cn"),
("我的电量为 14%", "我的电量为 14 百分之", "zh"),
("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
Expand Down
7 changes: 4 additions & 3 deletions TTS/tts/layers/xtts/trainer/gpt_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,10 @@ def eval_step(self, batch, criterion):
batch["cond_idxs"] = None
return self.train_step(batch, criterion)

def on_epoch_start(self, trainer): # pylint: disable=W0613
# guarante that dvae will be in eval mode after .train() on evaluation end
self.dvae = self.dvae.eval()
def on_train_epoch_start(self, trainer):
trainer.model.eval() # the whole model to eval
# put gpt model in training mode
trainer.model.xtts.gpt.train()

def on_init_end(self, trainer): # pylint: disable=W0613
# ignore similarities.pth on clearml save/upload
Expand Down
Loading

0 comments on commit 2211ba2

Please sign in to comment.