diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 1ef655a3cc..424c8bb35f 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -20,7 +20,7 @@
 
 
 def get_spacy_lang(lang):
-    if lang == "zh":
+    if lang in ["zh", "zh-cn"]:
         return Chinese()
     elif lang == "ja":
         return Japanese()
@@ -170,7 +170,7 @@ def split_sentence(text, lang, text_split_length=250):
             # There are not many common abbreviations in Arabic as in English.
         ]
     ],
-    "zh": [
+    "zh-cn": [
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
             # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
@@ -335,7 +335,7 @@ def expand_abbreviations_multilingual(text, lang="en"):
             ("°", " درجة "),
         ]
     ],
-    "zh": [
+    "zh-cn": [
         # Chinese
         (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
         for x in [
@@ -519,7 +519,7 @@ def _expand_number(m, lang="en"):
 
 
 def expand_numbers_multilingual(text, lang="en"):
-    if lang == "zh":
+    if lang in ["zh", "zh-cn"]:
         text = zh_num2words()(text)
     else:
         if lang in ["en", "ru"]:
@@ -602,6 +602,7 @@ def __init__(self, vocab_file=None):
             "pt": 203,
             "pl": 224,
             "zh": 82,
+            "zh-cn": 82,
             "ar": 166,
             "cs": 186,
             "ru": 182,
@@ -627,9 +628,9 @@ def check_input_length(self, txt, lang):
             )
 
     def preprocess_text(self, txt, lang):
-        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn", "ko"}:
             txt = multilingual_cleaners(txt, lang)
-        if lang == "zh":
+        if lang == "zh" or lang == "zh-cn":
             txt = chinese_transliterate(txt)
         if lang == "ko":
             txt = korean_transliterate(txt)