Skip to content

Commit

Permalink
Add char limit warn (#3130)
Browse files Browse the repository at this point in the history
* Add char limit warning

* Adding v2 langs

* cached_property for cutlet

* Fix import
  • Loading branch information
WeberJulian authored Nov 8, 2023
1 parent f846a9f commit ce1a39a
Showing 1 changed file with 41 additions and 1 deletion.
42 changes: 41 additions & 1 deletion TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
from functools import cached_property

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words

Expand Down Expand Up @@ -535,11 +536,50 @@ def korean_cleaners(text):
class VoiceBpeTokenizer:
def __init__(self, vocab_file=None):
self.tokenizer = None
self.katsu = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
self.char_limits = {
"en": 250,
"de": 253,
"fr": 273,
"es": 239,
"it": 213,
"pt": 203,
"pl": 224,
"zh-cn": 82,
"ar": 166,
"cs": 186,
"ru": 182,
"nl": 251,
"tr": 226,
"ja": 71,
"hu": 224,
"ko": 95,
}

@cached_property
def katsu(self):
import cutlet
return cutlet.Cutlet()

def check_input_length(self, txt, lang):
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")

def preprocess_text(self, txt, lang):
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
txt = multilingual_cleaners(txt, lang)
if lang == "zh-cn":
txt = chinese_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
else:
raise NotImplementedError()
return txt

def encode(self, txt, lang):
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
Expand Down

0 comments on commit ce1a39a

Please sign in to comment.