From 27b1bf474645508abd08beffcb08da0dbdfa6417 Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Mon, 19 Feb 2024 16:40:30 +0800
Subject: [PATCH 1/9] minor fix of vits/tokenizer.py

---
 egs/ljspeech/TTS/vits/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index 70f1240b4a..7a6b7f3bf2 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -74,7 +74,7 @@ def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
 
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
 
-        token_ids_list.append(token_ids)
+            token_ids_list.append(token_ids)
 
         return token_ids_list

From ff6784d147aded9f9198d79e6a7bee0b60c8002d Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Mon, 19 Feb 2024 16:51:17 +0800
Subject: [PATCH 2/9] minor fix of vits/tokenizer.py

---
 egs/ljspeech/TTS/vits/tokenizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index 7a6b7f3bf2..b0afc6a044 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -103,6 +103,7 @@ def tokens_to_token_ids(
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
 
-        token_ids_list.append(token_ids)
+
+            token_ids_list.append(token_ids)
 
         return token_ids_list

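Note on the two fixes above: they move `token_ids_list.append(token_ids)` into the per-utterance loop. At the old indentation the append ran only once, after the loop finished, so the returned list held just the token IDs of the final utterance. A self-contained sketch of the difference (hypothetical single-character vocabulary; `intersperse` mirrors the helper in vits/utils.py):

    def intersperse(sequence, item):
        # [1, 2, 3] -> [item, 1, item, 2, item, 3, item]
        result = [item] * (len(sequence) * 2 + 1)
        result[1::2] = sequence
        return result

    def to_ids_buggy(texts, vocab):
        token_ids_list = []
        for text in texts:
            token_ids = intersperse([vocab[c] for c in text], 0)
        token_ids_list.append(token_ids)  # outside the loop: appended once
        return token_ids_list

    def to_ids_fixed(texts, vocab):
        token_ids_list = []
        for text in texts:
            token_ids = intersperse([vocab[c] for c in text], 0)
            token_ids_list.append(token_ids)  # inside the loop: once per text
        return token_ids_list

    vocab = {"a": 1, "b": 2}
    assert len(to_ids_buggy(["ab", "ba"], vocab)) == 1  # only the last utterance
    assert len(to_ids_fixed(["ab", "ba"], vocab)) == 2  # one entry per utterance
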
From 2cf5891c154edd5493ef66a56fc170099ed0ec4b Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Tue, 20 Feb 2024 17:45:56 +0800
Subject: [PATCH 3/9] use piper_phonemize as text tokenizer in ljspeech recipe

---
 egs/ljspeech/TTS/local/prepare_token_file.py | 65 +++++--------------
 .../TTS/local/prepare_tokens_ljspeech.py | 9 ++-
 egs/ljspeech/TTS/prepare.sh | 14 ++--
 egs/ljspeech/TTS/vits/tokenizer.py | 42 ++++++++++--
 4 files changed, 66 insertions(+), 64 deletions(-)

diff --git a/egs/ljspeech/TTS/local/prepare_token_file.py b/egs/ljspeech/TTS/local/prepare_token_file.py
index df976804ab..29e4a50c9f 100755
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@@ -17,7 +17,7 @@
 
 
 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the file that maps tokens to IDs.
 """
 
 import argparse
@@ -25,80 +25,47 @@
 from pathlib import Path
 from typing import Dict
 
-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map
 
 
 def get_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
-
     parser.add_argument(
         "--tokens",
        type=Path,
         default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
     )
 
     return parser.parse_args()
 
 
-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Get a dict that maps token to IDs, and save it to the given filename."""
     extra_tokens = [
         "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
+        "<sos>",  # 1 for sos
+        "<eos>",  # 2 for eos
+        "<unk>",  # 3 for OOV
     ]
 
-    all_tokens = set()
-
-    cut_set = load_manifest(manifest_file)
+    all_tokens = list(get_espeak_map().keys())
 
-    for cut in cut_set:
-        # Each cut only contain one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
+    for t in extra_tokens:
+        assert t not in all_tokens, t
 
-    all_tokens = extra_tokens + list(all_tokens)
+    all_tokens = extra_tokens + all_tokens
 
-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+    with open(filename, "w", encoding="utf-8") as f:
+        for i, token in enumerate(all_tokens):
+            f.write(f"{token} {i}\n")
 
 
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    manifest_file = Path(args.manifest_file)
     out_file = Path(args.tokens)
-
-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)

diff --git a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
index fcd0137a08..56361cf9a9 100755
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@@ -23,9 +23,9 @@
 import logging
 from pathlib import Path
 
-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak
 
 
 def prepare_tokens_ljspeech():
@@ -35,7 +35,6 @@ def prepare_tokens_ljspeech():
     partition = "all"
 
     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()
 
     new_cuts = []
     for cut in cut_set:
@@ -45,7 +44,11 @@ def prepare_tokens_ljspeech():
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)
 
     new_cut_set = CutSet.from_cuts(new_cuts)

diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh
index ed0a07f5e2..890bc841f8 100755
--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     cd vits/monotonic_align
     python setup.py build_ext --inplace
     cd ../../
-  else 
+  else
    log "monotonic_align lib already built"
   fi
 fi
@@ -80,6 +80,10 @@ fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LJSpeech"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
     ./local/prepare_tokens_ljspeech.py
     mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \
@@ -113,13 +117,11 @@ fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
   #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/ljspeech_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi

diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index b0afc6a044..64530fa335 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -16,8 +16,8 @@
 
 from typing import Dict, List
 
-import g2p_en
 import tacotron_cleaner.cleaners
+from piper_phonemize import phonemize_espeak
 from utils import intersperse
 
 
@@ -41,18 +41,28 @@ def __init__(self, tokens: str):
             self.token2id[token] = id
 
         self.blank_id = self.token2id["<blk>"]
+        self.sos_id = self.token2id["<sos>"]
+        self.eos_id = self.token2id["<eos>"]
         self.oov_id = self.token2id["<unk>"]
         self.vocab_size = len(self.token2id)
 
-        self.g2p = g2p_en.G2p()
-
-    def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
+    def texts_to_token_ids(
+        self,
+        texts: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
+    ):
         """
         Args:
           texts:
            A list of transcripts.
           intersperse_blank:
            Whether to intersperse blanks in the token sequence.
+          add_sos:
+           Whether to add sos token at the start.
+          add_eos:
+           Whether to add eos token at the end.
 
         Returns:
          Return a list of token id list [utterance][token_id]
@@ -63,7 +73,11 @@ def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens = self.g2p(text)
+            tokens_list = phonemize_espeak(text, "en-us")
+            tokens = []
+            for t in tokens_list:
+                tokens.extend(t)
+
             token_ids = []
             for t in tokens:
                 if t in self.token2id:
@@ -73,13 +87,21 @@ def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
 
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
 
             token_ids_list.append(token_ids)
 
         return token_ids_list
 
     def tokens_to_token_ids(
-        self, tokens_list: List[str], intersperse_blank: bool = True
+        self,
+        tokens_list: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
     ):
         """
         Args:
          tokens_list:
            A list of token list, each corresponding to one utterance.
          intersperse_blank:
            Whether to intersperse blanks in the token sequence.
+          add_sos:
+           Whether to add sos token at the start.
+          add_eos:
+           Whether to add eos token at the end.
 
         Returns:
          Return a list of token id list [utterance][token_id]
@@ -103,6 +129,10 @@ def tokens_to_token_ids(
 
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
 
             token_ids_list.append(token_ids)
 
         return token_ids_list

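A point worth noting about the switch above: `phonemize_espeak` returns one phoneme list per sentence rather than a flat sequence, which is why both prepare_tokens_ljspeech.py and the tokenizer flatten the result before looking up IDs. A minimal sketch (requires piper_phonemize; the exact phoneme strings depend on the installed espeak-ng data):

    from piper_phonemize import phonemize_espeak

    # Two sentences in, two phoneme lists out, e.g. roughly
    # [['h', 'ə', 'l', 'ˈoʊ', '.'], ['h', 'ˈaɪ', '.']].
    tokens_list = phonemize_espeak("Hello. Hi.", "en-us")

    # Flatten into the single per-utterance sequence the recipe stores in cut.tokens.
    tokens = []
    for t in tokens_list:
        tokens.extend(t)
    print(tokens)
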
From cb04833f8e6b48e599018ee44a175cde86fdc98d Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Tue, 20 Feb 2024 21:18:03 +0800
Subject: [PATCH 4/9] remove extra tokens

---
 egs/ljspeech/TTS/local/prepare_token_file.py | 19 +++-----------
 egs/ljspeech/TTS/prepare.sh | 6 +++--
 egs/ljspeech/TTS/vits/tokenizer.py | 27 ++++++++++----------
 3 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/egs/ljspeech/TTS/local/prepare_token_file.py b/egs/ljspeech/TTS/local/prepare_token_file.py
index 29e4a50c9f..dd76c1565c 100755
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@@ -43,23 +43,10 @@ def get_args():
 
 def get_token2id(filename: Path) -> Dict[str, int]:
     """Get a dict that maps token to IDs, and save it to the given filename."""
-    extra_tokens = [
-        "<blk>",  # 0 for blank
-        "<sos>",  # 1 for sos
-        "<eos>",  # 2 for eos
-        "<unk>",  # 3 for OOV
-    ]
-
-    all_tokens = list(get_espeak_map().keys())
-
-    for t in extra_tokens:
-        assert t not in all_tokens, t
-
-    all_tokens = extra_tokens + all_tokens
-
+    all_tokens = get_espeak_map()
     with open(filename, "w", encoding="utf-8") as f:
-        for i, token in enumerate(all_tokens):
-            f.write(f"{token} {i}\n")
+        for token, token_id in all_tokens.items():
+            f.write(f"{token} {token_id[0]}\n")

diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh
index 890bc841f8..cbf27bd423 100755
--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@@ -82,7 +82,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LJSpeech"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #     could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
   #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
     ./local/prepare_tokens_ljspeech.py
@@ -119,7 +120,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #     could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
   #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
     ./local/prepare_token_file.py --tokens data/tokens.txt

diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index 64530fa335..e005fc1845 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -38,12 +38,15 @@ def __init__(self, tokens: str):
             id = int(info[0])
         else:
             token, id = info[0], int(info[1])
+            assert token not in self.token2id, token
             self.token2id[token] = id
 
-        self.blank_id = self.token2id["<blk>"]
-        self.sos_id = self.token2id["<sos>"]
-        self.eos_id = self.token2id["<eos>"]
-        self.oov_id = self.token2id["<unk>"]
+        # Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
+        self.pad_id = self.token2id["_"]  # padding
+        self.sos_id = self.token2id["^"]  # beginning of an utterance (bos)
+        self.eos_id = self.token2id["$"]  # end of an utterance (eos)
+        self.space_id = self.token2id[" "]  # word separator (whitespace)
+
         self.vocab_size = len(self.token2id)
@@ -80,13 +83,11 @@ def texts_to_token_ids(
 
             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
+                token_ids.append(self.token2id[t])
 
             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
             if add_sos:
                 token_ids = [self.sos_id] + token_ids
             if add_eos:
@@ -122,13 +123,11 @@ def tokens_to_token_ids(
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
+                token_ids.append(self.token2id[t])
 
             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
             if add_sos:
                 token_ids = [self.sos_id] + token_ids
             if add_eos:

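With the extra tokens gone, data/tokens.txt becomes exactly the espeak inventory shipped with piper_phonemize, and the special symbols are piper's own: `_` (pad), `^` (bos), `$` (eos), and the space character as word separator. One way to inspect the map this patch relies on (a sketch, assuming piper_phonemize is installed):

    from piper_phonemize import get_espeak_map

    token2ids = get_espeak_map()  # dict: token -> [token_id]
    print(len(token2ids))  # vocabulary size
    print(token2ids["_"], token2ids["^"], token2ids["$"])  # pad / bos / eos ids
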
From 18514438013b7687cb2fde29114e6ecb5bcf4ebf Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Tue, 20 Feb 2024 22:14:46 +0800
Subject: [PATCH 5/9] minor updates

---
 egs/ljspeech/TTS/local/prepare_token_file.py | 10 +++++++---
 .../TTS/local/prepare_tokens_ljspeech.py | 2 +-
 egs/ljspeech/TTS/vits/tokenizer.py | 20 +++++++++++++------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/egs/ljspeech/TTS/local/prepare_token_file.py b/egs/ljspeech/TTS/local/prepare_token_file.py
index dd76c1565c..5b048b600b 100755
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@@ -43,10 +43,14 @@ def get_args():
 
 def get_token2id(filename: Path) -> Dict[str, int]:
     """Get a dict that maps token to IDs, and save it to the given filename."""
-    all_tokens = get_espeak_map()
+    all_tokens = get_espeak_map()  # token: [token_id]
+    all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
+    # sort by token_id
+    all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])
+
     with open(filename, "w", encoding="utf-8") as f:
-        for token, token_id in all_tokens.items():
-            f.write(f"{token} {token_id[0]}\n")
+        for token, token_id in all_tokens:
+            f.write(f"{token} {token_id}\n")

diff --git a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
index 56361cf9a9..08fe7430ef 100755
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@@ -39,7 +39,7 @@ def prepare_tokens_ljspeech():
     new_cuts = []
     for cut in cut_set:
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].normalized_text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)

diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index e005fc1845..9a5a9090ec 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -1,4 +1,4 @@
-# Copyright      2023 Xiaomi Corp.        (authors: Zengwei Yao)
+# Copyright 2023-2024 Xiaomi Corp.        (authors: Zengwei Yao)
 #
 # See ../../LICENSE for clarification regarding multiple authors
 #
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import Dict, List
 
 import tacotron_cleaner.cleaners
@@ -55,7 +56,8 @@ def texts_to_token_ids(
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+        lang: str = "en-us",
+    ) -> List[List[int]]:
         """
         Args:
           texts:
@@ -66,6 +68,8 @@ def texts_to_token_ids(
            Whether to add sos token at the start.
          add_eos:
            Whether to add eos token at the end.
+          lang:
+           Language argument passed to phonemize_espeak().
 
         Returns:
          Return a list of token id list [utterance][token_id]
@@ -76,14 +80,16 @@ def texts_to_token_ids(
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens_list = phonemize_espeak(text, "en-us")
+            tokens_list = phonemize_espeak(text, lang)
             tokens = []
             for t in tokens_list:
                 tokens.extend(t)
 
             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])
 
             if intersperse_blank:
@@ -103,7 +109,7 @@ def tokens_to_token_ids(
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+    ) -> List[List[int]]:
         """
         Args:
           tokens_list:
@@ -123,7 +129,9 @@ def tokens_to_token_ids(
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])
 
             if intersperse_blank:

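The sorting added above matters because the iteration order of the dict returned by `get_espeak_map()` is not guaranteed to follow the IDs; writing entries in ID order keeps each line of tokens.txt aligned with the corresponding row of the model's embedding table. A sketch of what ends up in the file (concrete tokens and IDs depend on the installed piper_phonemize version):

    from piper_phonemize import get_espeak_map

    all_tokens = {token: ids[0] for token, ids in get_espeak_map().items()}
    for token, token_id in sorted(all_tokens.items(), key=lambda x: x[1]):
        # One "<token> <id>" pair per line, in increasing id order,
        # matching what prepare_token_file.py writes to data/tokens.txt.
        print(token, token_id)
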
From 595d4a3c47f5d3cd24f60fe071011aa458ac1716 Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Wed, 21 Feb 2024 17:50:19 +0800
Subject: [PATCH 6/9] modify usage of tokenizer in vits/train.py

---
 egs/ljspeech/TTS/vits/train.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/egs/ljspeech/TTS/vits/train.py b/egs/ljspeech/TTS/vits/train.py
index 71c4224fa7..6589b75ff6 100755
--- a/egs/ljspeech/TTS/vits/train.py
+++ b/egs/ljspeech/TTS/vits/train.py
@@ -296,14 +296,16 @@ def prepare_input(batch: dict, tokenizer: Tokenizer, device: torch.device):
     features_lens = batch["features_lens"].to(device)
 
     tokens = batch["tokens"]
-    tokens = tokenizer.tokens_to_token_ids(tokens)
+    tokens = tokenizer.tokens_to_token_ids(
+        tokens, intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = k2.RaggedTensor(tokens)
     row_splits = tokens.shape.row_splits(1)
     tokens_lens = row_splits[1:] - row_splits[:-1]
     tokens = tokens.to(device)
     tokens_lens = tokens_lens.to(device)
     # a tensor of shape (B, T)
-    tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+    tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
 
     return audio, audio_lens, features, features_lens, tokens, tokens_lens
 
@@ -742,8 +744,7 @@ def run(rank, world_size, args):
     logging.info(f"Device: {device}")
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     logging.info(params)

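For context, the ragged-to-padded conversion in `prepare_input` above works as follows; padding now uses piper's `_` symbol via `tokenizer.pad_id` instead of the removed `blank_id`. A standalone sketch with made-up ID lists and pad id 0 (requires k2):

    import k2

    token_ids = [[2, 5, 7], [3, 9]]  # two utterances of different lengths
    tokens = k2.RaggedTensor(token_ids)
    row_splits = tokens.shape.row_splits(1)
    tokens_lens = row_splits[1:] - row_splits[:-1]      # tensor([3, 2])
    padded = tokens.pad(mode="constant", padding_value=0)  # shape (2, 3)
    print(tokens_lens, padded)
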
From ae83d8070d4c8144d6f6335c0c55f3ae115c453c Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Wed, 28 Feb 2024 20:37:42 +0800
Subject: [PATCH 7/9] minor updates related to the tokenizer change

---
 egs/ljspeech/TTS/vits/export-onnx.py | 3 +--
 egs/ljspeech/TTS/vits/infer.py | 9 +++++----
 egs/ljspeech/TTS/vits/test_onnx.py | 4 +++-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/egs/ljspeech/TTS/vits/export-onnx.py b/egs/ljspeech/TTS/vits/export-onnx.py
index f82f9dbe9b..c607f0114b 100755
--- a/egs/ljspeech/TTS/vits/export-onnx.py
+++ b/egs/ljspeech/TTS/vits/export-onnx.py
@@ -218,8 +218,7 @@ def main():
     params.update(vars(args))
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     logging.info(params)

diff --git a/egs/ljspeech/TTS/vits/infer.py b/egs/ljspeech/TTS/vits/infer.py
index cf0d20ae23..9e7c71c6dc 100755
--- a/egs/ljspeech/TTS/vits/infer.py
+++ b/egs/ljspeech/TTS/vits/infer.py
@@ -130,14 +130,16 @@ def _save_worker(
             batch_size = len(batch["tokens"])
 
             tokens = batch["tokens"]
-            tokens = tokenizer.tokens_to_token_ids(tokens)
+            tokens = tokenizer.tokens_to_token_ids(
+                tokens, intersperse_blank=True, add_sos=True, add_eos=True
+            )
             tokens = k2.RaggedTensor(tokens)
             row_splits = tokens.shape.row_splits(1)
             tokens_lens = row_splits[1:] - row_splits[:-1]
             tokens = tokens.to(device)
             tokens_lens = tokens_lens.to(device)
             # tensor of shape (B, T)
-            tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+            tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
 
             audio = batch["audio"]
             audio_lens = batch["audio_lens"].tolist()
@@ -201,8 +203,7 @@ def main():
     device = torch.device("cuda", 0)
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     logging.info(f"Device: {device}")

diff --git a/egs/ljspeech/TTS/vits/test_onnx.py b/egs/ljspeech/TTS/vits/test_onnx.py
index fcbc1d6632..4f46e8e6c5 100755
--- a/egs/ljspeech/TTS/vits/test_onnx.py
+++ b/egs/ljspeech/TTS/vits/test_onnx.py
@@ -108,7 +108,9 @@ def main():
     model = OnnxModel(args.model_filename)
 
     text = "I went there to see the land, the people and how their system works, end quote."
-    tokens = tokenizer.texts_to_token_ids([text])
+    tokens = tokenizer.texts_to_token_ids(
+        [text], intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = torch.tensor(tokens)  # (1, T)
     tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)  # (1, T)
     audio = model(tokens, tokens_lens)  # (1, T')

From 956e58fe833a489b00d022fe097a3d9ecc82e507 Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Wed, 28 Feb 2024 20:50:16 +0800
Subject: [PATCH 8/9] update docs

---
 docs/source/recipes/TTS/ljspeech/vits.rst | 4 ++--
 docs/source/recipes/TTS/vctk/vits.rst | 4 ++--
 requirements-tts.txt | 3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/source/recipes/TTS/ljspeech/vits.rst b/docs/source/recipes/TTS/ljspeech/vits.rst
index d08aa0f470..7296fc98d7 100644
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@@ -1,11 +1,11 @@
-VITS
+VITS-LJSpeech
 ===============
 
 This tutorial shows you how to train a VITS model
 with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.
 
 .. note::
-    
+
     TTS related recipes require packages in ``requirements-tts.txt``.
 
 .. note::

diff --git a/docs/source/recipes/TTS/vctk/vits.rst b/docs/source/recipes/TTS/vctk/vits.rst
index 34024a5ea5..45ae9d9d20 100644
--- a/docs/source/recipes/TTS/vctk/vits.rst
+++ b/docs/source/recipes/TTS/vctk/vits.rst
@@ -1,11 +1,11 @@
-VITS
+VITS-VCTK
 ===============
 
 This tutorial shows you how to train a VITS model
 with the `VCTK <https://datashare.ed.ac.uk/handle/10283/3443>`_ dataset.
 
 .. note::
-    
+
     TTS related recipes require packages in ``requirements-tts.txt``.
 
 .. note::

diff --git a/requirements-tts.txt b/requirements-tts.txt
index c30e23d549..eae50ba7b5 100644
--- a/requirements-tts.txt
+++ b/requirements-tts.txt
@@ -3,4 +3,5 @@ matplotlib==3.8.2
 cython==3.0.6
 numba==0.58.1
 g2p_en==2.1.0
-espnet_tts_frontend==0.0.3
\ No newline at end of file
+espnet_tts_frontend==0.0.3
+# piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize, could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5

From 1d66426fa87470288ffca8aa8f80193e0ad8b194 Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Wed, 28 Feb 2024 23:52:36 +0800
Subject: [PATCH 9/9] update huggingface link

---
 docs/source/recipes/TTS/ljspeech/vits.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/recipes/TTS/ljspeech/vits.rst b/docs/source/recipes/TTS/ljspeech/vits.rst
index 7296fc98d7..323d0adfc8 100644
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@@ -120,4 +120,4 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following link:
 
-  - ``_
+  - ``_

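Taken together, the series replaces g2p_en with piper_phonemize across the whole LJSpeech VITS pipeline: prepare_tokens_ljspeech.py phonemizes the transcripts, prepare_token_file.py dumps the espeak token map to data/tokens.txt, and the Tokenizer consumes both at training and inference time. A condensed sketch of the resulting inference-side flow (paths are the recipe defaults; the input sentence is arbitrary):

    import torch
    from tokenizer import Tokenizer  # egs/ljspeech/TTS/vits/tokenizer.py

    tokenizer = Tokenizer("data/tokens.txt")  # written by prepare_token_file.py

    token_ids = tokenizer.texts_to_token_ids(
        ["This is a test sentence."],
        intersperse_blank=True,
        add_sos=True,
        add_eos=True,
    )
    tokens = torch.tensor(token_ids)  # (1, T)
    tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)
    # `tokens` and `tokens_lens` are what test_onnx.py feeds the exported model.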