From d04d5fd5e6c9ce4e4fa520db5d1b9aa80937be69 Mon Sep 17 00:00:00 2001 From: zyingt Date: Mon, 29 Jan 2024 21:35:40 +0800 Subject: [PATCH 01/13] Added multi-speaker support to VITS --- config/tts.json | 1 + egs/datasets/README.md | 31 +++++ egs/tts/VITS/README.md | 181 +++++++++++++++++++++++++++++- egs/tts/VITS/exp_config.json | 11 +- egs/tts/VITS/run.sh | 11 +- models/tts/base/tts_dataset.py | 3 + models/tts/vits/vits_dataset.py | 13 +++ models/tts/vits/vits_inference.py | 6 +- preprocessors/processor.py | 3 + utils/data_utils.py | 6 + 10 files changed, 259 insertions(+), 7 deletions(-) diff --git a/config/tts.json b/config/tts.json index b314a5f4..e580a6b1 100644 --- a/config/tts.json +++ b/config/tts.json @@ -16,6 +16,7 @@ // Directory names of processed data or extracted features "phone_dir": "phones", "use_phone": true, + "add_blank": true }, "model": { "text_token_num": 512, diff --git a/egs/datasets/README.md b/egs/datasets/README.md index 8e3d8cf8..ecb85087 100644 --- a/egs/datasets/README.md +++ b/egs/datasets/README.md @@ -6,6 +6,7 @@ Amphion support the following academic datasets (sort alphabetically): - [AudioCaps](#audiocaps) - [CSD](#csd) - [CustomSVCDataset](#customsvcdataset) + - [Hi-Fi TTS](#hifitts) - [KiSing](#kising) - [LibriLight](#librilight) - [LibriTTS](#libritts) @@ -73,6 +74,36 @@ We support custom dataset for Singing Voice Conversion. Organize your data in th ┣ ... ``` + +## Hi-Fi TTS + +Download the official Hi-Fi TTS dataset [here](https://www.openslr.org/109/). The file structure looks like below: + +```plaintext +[Hi-Fi TTS dataset path] + ┣ audio + ┃ ┣ 11614_other {Speaker_ID}_{SNR_subset} + ┃ ┃ ┣ 10547 {Book_ID} + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0001.flac + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0003.flac + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0004.flac + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ 92_manifest_clean_dev.json + ┣ 92_manifest_clean_test.json + ┣ 92_manifest_clean_train.json + ┣ ... + ┣ {Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json + ┣ ... + ┣ books_bandwidth.tsv + ┣ LICENSE.txt + ┣ readers_books_clean.txt + ┣ readers_books_other.txt + ┣ README.txt + +``` + ## KiSing Download the official KiSing dataset [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure looks like below: diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index a5147c3a..e6229238 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -3,8 +3,8 @@ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech) -In this recipe, we will show how to train [VITS](https://arxiv.org/abs/2106.06103) using Amphion's infrastructure. VITS is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. - +In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. The detailed instructions for training [single speaker](#single-speaker-vits) and [multi-speaker](#multi-speaker-vits) VITS can be found below: +## Single Speaker VITS There are four stages in total: 1. 
Data preparation @@ -167,6 +167,183 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. +## Multi-speaker VITS +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used multi-speaker TTS dataset to train TTS model, i.e., Hi-Fi TTS, LibriTTS etc. We strongly recommend you use Hi-Fi TTS to train TTS model for the first time. The process of downloading dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "hifitts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "hifitts": "[Hi-Fi TTS dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data, set `extract_audio` and `use_spkid` to `true`. + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + "extract_audio": true, + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 24000, //target sampling rate + "valid_file": "valid.json", //validation set + "use_spkid": true, //true: use speaker id for multi-speaker dataset + }, +``` + +### Run + +Run the `run.sh` as the preprocess stage (set `--stage 1`): + +```bash +sh egs/tts/VITS/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. Remember to specify the `n_speakers` according to the number of speakers in your dataset and set `multi_speaker_training` to `true`. + +```json + "model": { + // TODO: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. + "n_speakers": 10 + }, + "train": { + "batch_size": 16, + "multi_speaker_training": true, + } +``` + +### Train From Scratch + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] +``` + +### Train From Existing Source + +We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. + +Setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. 
For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run: + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true +``` + +You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run: + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ +``` + +If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run: + + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ + --resume_type "finetune" +``` + +> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. +> +> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. + +Here are some example scenarios to better understand how to use these arguments: +| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` | +| ------ | -------- | ----------------------- | ------------- | +| You want to train from scratch | no | no | no | +| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no | +| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no | +| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` | + + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + +| Parameters | Description | Example | +| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test | For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. 
| +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_speaker_name` | The target speaker's voice to be synthesized. | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | + +### Run +For example, if you want to generate speech from all testing set split from Hi-Fi TTS, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "hifitts" \ + --infer_testing_set "test" +``` + +Or, if you want to generate a single clip of speech from a given text, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --infer_speaker_name "hifitts_92" +``` + +We will release a pre-trained multi-speaker VITS model trained on Hi-Fi TTS soon. Stay tuned! + ```bibtex @inproceedings{kim2021conditional, diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json index b210a265..a385a7c4 100644 --- a/egs/tts/VITS/exp_config.json +++ b/egs/tts/VITS/exp_config.json @@ -11,17 +11,22 @@ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" "log_dir": "ckpts/tts", "preprocess": { + //"extract_audio":true, "use_phone": true, // linguistic features "extract_phone": true, "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", - - "sample_rate": 22050, - "valid_file": "test.json", // validattion set + "sample_rate": 22050, // target sampling rate + "valid_file": "valid.json", // validation set + //"use_spkid": true //true: use speaker id for multi-speaker dataset + }, + "model":{ + //"n_speakers": 10 //Specify number of speakers for multi-speaker dataset }, "train": { "batch_size": 16, + //"multi_speaker_training": true //Set to true for multi-speaker training } } \ No newline at end of file diff --git a/egs/tts/VITS/run.sh b/egs/tts/VITS/run.sh index ad63b425..6e243dbc 100644 --- a/egs/tts/VITS/run.sh +++ b/egs/tts/VITS/run.sh @@ -18,7 +18,7 @@ cd $work_dir ######## Parse the Given Parameters from the Commond ########### # options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@") -options=$(getopt -o c:n:s --long gpu:,config:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,infer_speaker_name:,name:,stage: -- "$@") eval set -- "$options" while true; do @@ -51,6 +51,8 @@ while true; do --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". --infer_text) shift; infer_text=$1 ; shift ;; + # [Only for Inference] The speaker voice to be delivered in the synthesized speech. It is only used when the inference model is "single". 
+ --infer_speaker_name) shift; infer_speaker_name=$1 ; shift ;; --) shift ; break ;; *) echo "Invalid option: $1" exit 1 ;; @@ -153,6 +155,12 @@ if [ $running_stage -eq 3 ]; then elif [ "$infer_mode" = "batch" ]; then infer_text='' fi + + if [ -z "$infer_speaker_name" ]; then + infer_speaker_name=None + fi + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ @@ -163,6 +171,7 @@ if [ $running_stage -eq 3 ]; then --dataset $infer_dataset \ --testing_set $infer_testing_set \ --text "$infer_text" \ + --speaker_name $infer_speaker_name \ --log_level debug diff --git a/models/tts/base/tts_dataset.py b/models/tts/base/tts_dataset.py index b3f6ac7c..f6c9acb1 100644 --- a/models/tts/base/tts_dataset.py +++ b/models/tts/base/tts_dataset.py @@ -209,6 +209,9 @@ def __init__(self, cfg, dataset, is_valid=False): phon_id_collator = phoneIDCollation(cfg, dataset=dataset) sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq) + if cfg.preprocess.add_blank: + sequence = intersperse(sequence,0) + self.utt2seq[utt] = sequence def __getitem__(self, index): diff --git a/models/tts/vits/vits_dataset.py b/models/tts/vits/vits_dataset.py index cd596894..19c71dd5 100644 --- a/models/tts/vits/vits_dataset.py +++ b/models/tts/vits/vits_dataset.py @@ -27,6 +27,19 @@ def __getitem__(self, index): def __len__(self): return super().__len__() + def get_metadata(self): + metadata_filter = [] + with open(self.metafile_path, "r", encoding="utf-8") as f: + metadata = json.load(f) + for utt_info in metadata: + duration = utt_info['Duration'] + frame_len = duration * self.cfg.preprocess.sample_rate // self.cfg.preprocess.hop_size + if frame_len < self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size: + continue + metadata_filter.append(utt_info) + + return metadata_filter + class VITSCollator(TTSCollator): """Zero-pads model inputs and targets based on number of frames per step""" diff --git a/models/tts/vits/vits_inference.py b/models/tts/vits/vits_inference.py index 6c3d385a..7154303a 100644 --- a/models/tts/vits/vits_inference.py +++ b/models/tts/vits/vits_inference.py @@ -14,6 +14,7 @@ from models.tts.vits.vits import SynthesizerTrn from processors.phone_extractor import phoneExtractor from text.text_token_collation import phoneIDCollation +from utils.data_utils import * class VitsInference(TTSInference): @@ -120,6 +121,9 @@ def inference_for_single_utterance( ) phone_id_seq = phon_id_collator.get_phone_id_sequence(self.cfg, phone_seq) + if self.cfg.preprocess.add_blank: + phone_id_seq = intersperse(phone_id_seq, 0) + # convert phone sequence to phone id sequence phone_id_seq = np.array(phone_id_seq) phone_id_seq = torch.from_numpy(phone_id_seq) @@ -131,7 +135,7 @@ def inference_for_single_utterance( with open(spk2id_file, "r") as f: spk2id = json.load(f) speaker_id = spk2id[self.args.speaker_name] - speaker_id = torch.from_numpy(np.array([speaker_id], dtype=np.int32)) + speaker_id = torch.from_numpy(np.array([speaker_id], dtype=np.int32)).unsqueeze(0) with torch.no_grad(): x_tst = phone_id_seq.to(self.device).unsqueeze(0) diff --git a/preprocessors/processor.py b/preprocessors/processor.py index 1a1d0362..51c52a9d 100644 --- a/preprocessors/processor.py +++ b/preprocessors/processor.py @@ -29,6 +29,7 @@ vocalist, ljspeech_vocoder, librilight, + hifitts ) @@ -93,6 +94,8 @@ def preprocess_dataset( vocalist.main(output_path, dataset_path) if dataset == "librilight": librilight.main(output_path, dataset_path, cfg) + if dataset == "hifitts": + hifitts.main(output_path, 
dataset_path) def prepare_align(dataset, dataset_path, cfg, output_path): diff --git a/utils/data_utils.py b/utils/data_utils.py index 7976d050..b66817c5 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -12,6 +12,12 @@ from sklearn.preprocessing import StandardScaler + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + def load_content_feature_path(meta_data, processed_dir, feat_dir): utt2feat_path = {} for utt_info in meta_data: From c8820a6cbe67093141486fdd91bbbc890cd54ffe Mon Sep 17 00:00:00 2001 From: zyingt Date: Mon, 29 Jan 2024 22:11:10 +0800 Subject: [PATCH 02/13] Added multispeaker support to VITS --- egs/tts/VITS/run.sh | 14 +++++++------- models/tts/base/tts_dataset.py | 2 +- models/tts/vits/vits_dataset.py | 15 +++++++++++---- models/tts/vits/vits_inference.py | 4 +++- preprocessors/processor.py | 2 +- utils/data_utils.py | 4 ++-- 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/egs/tts/VITS/run.sh b/egs/tts/VITS/run.sh index 6e243dbc..dd702795 100644 --- a/egs/tts/VITS/run.sh +++ b/egs/tts/VITS/run.sh @@ -43,15 +43,15 @@ while true; do --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; - # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech. --infer_mode) shift; infer_mode=$1 ; shift ;; - # [Only for Inference] The inference dataset. It is only used when the inference model is "batch". + # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch". --infer_dataset) shift; infer_dataset=$1 ; shift ;; - # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; - # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + # [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single". --infer_text) shift; infer_text=$1 ; shift ;; - # [Only for Inference] The speaker voice to be delivered in the synthesized speech. It is only used when the inference model is "single". + # [Only for Inference] The chosen speaker's voice to be synthesized. It is only used when the inference mode is "single" for multi-speaker VITS. 
--infer_speaker_name) shift; infer_speaker_name=$1 ; shift ;; --) shift ; break ;; @@ -69,7 +69,7 @@ fi if [ -z "$exp_config" ]; then exp_config="${exp_dir}"/exp_config.json fi -echo "Exprimental Configuration File: $exp_config" +echo "Experimental Configuration File: $exp_config" if [ -z "$gpu" ]; then gpu="0" @@ -88,7 +88,7 @@ if [ $running_stage -eq 2 ]; then echo "[Error] Please specify the experiments name" exit 1 fi - echo "Exprimental Name: $exp_name" + echo "Experimental Name: $exp_name" # add default value if [ -z "$resume_from_ckpt_path" ]; then diff --git a/models/tts/base/tts_dataset.py b/models/tts/base/tts_dataset.py index f6c9acb1..fc85afb9 100644 --- a/models/tts/base/tts_dataset.py +++ b/models/tts/base/tts_dataset.py @@ -210,7 +210,7 @@ def __init__(self, cfg, dataset, is_valid=False): sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq) if cfg.preprocess.add_blank: - sequence = intersperse(sequence,0) + sequence = intersperse(sequence, 0) self.utt2seq[utt] = sequence diff --git a/models/tts/vits/vits_dataset.py b/models/tts/vits/vits_dataset.py index 19c71dd5..291fcb55 100644 --- a/models/tts/vits/vits_dataset.py +++ b/models/tts/vits/vits_dataset.py @@ -32,12 +32,19 @@ def get_metadata(self): with open(self.metafile_path, "r", encoding="utf-8") as f: metadata = json.load(f) for utt_info in metadata: - duration = utt_info['Duration'] - frame_len = duration * self.cfg.preprocess.sample_rate // self.cfg.preprocess.hop_size - if frame_len < self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size: + duration = utt_info["Duration"] + frame_len = ( + duration + * self.cfg.preprocess.sample_rate + // self.cfg.preprocess.hop_size + ) + if ( + frame_len + < self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size + ): continue metadata_filter.append(utt_info) - + return metadata_filter diff --git a/models/tts/vits/vits_inference.py b/models/tts/vits/vits_inference.py index 7154303a..d0563dbf 100644 --- a/models/tts/vits/vits_inference.py +++ b/models/tts/vits/vits_inference.py @@ -135,7 +135,9 @@ def inference_for_single_utterance( with open(spk2id_file, "r") as f: spk2id = json.load(f) speaker_id = spk2id[self.args.speaker_name] - speaker_id = torch.from_numpy(np.array([speaker_id], dtype=np.int32)).unsqueeze(0) + speaker_id = torch.from_numpy( + np.array([speaker_id], dtype=np.int32) + ).unsqueeze(0) with torch.no_grad(): x_tst = phone_id_seq.to(self.device).unsqueeze(0) diff --git a/preprocessors/processor.py b/preprocessors/processor.py index 51c52a9d..037ac6c5 100644 --- a/preprocessors/processor.py +++ b/preprocessors/processor.py @@ -29,7 +29,7 @@ vocalist, ljspeech_vocoder, librilight, - hifitts + hifitts, ) diff --git a/utils/data_utils.py b/utils/data_utils.py index b66817c5..e8762053 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -12,12 +12,12 @@ from sklearn.preprocessing import StandardScaler - def intersperse(lst, item): result = [item] * (len(lst) * 2 + 1) result[1::2] = lst return result - + + def load_content_feature_path(meta_data, processed_dir, feat_dir): utt2feat_path = {} for utt_info in meta_data: From ec033d3de09d76da157db3349263642f9601c9ff Mon Sep 17 00:00:00 2001 From: zyingt Date: Tue, 30 Jan 2024 16:57:16 +0800 Subject: [PATCH 03/13] Multi-speaker VITS support --- models/tts/vits/vits_inference.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/models/tts/vits/vits_inference.py b/models/tts/vits/vits_inference.py index d0563dbf..5e28858a 100644 --- 
a/models/tts/vits/vits_inference.py +++ b/models/tts/vits/vits_inference.py @@ -134,7 +134,12 @@ def inference_for_single_utterance( spk2id_file = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id) with open(spk2id_file, "r") as f: spk2id = json.load(f) - speaker_id = spk2id[self.args.speaker_name] + speaker_name = self.args.speaker_name + assert ( + speaker_name in spk2id + ), f"Speaker {speaker_name} not found in the spk2id keys. \ + Please make sure you've specified the correct speaker name in infer_speaker_name." + speaker_id = spk2id[speaker_name] speaker_id = torch.from_numpy( np.array([speaker_id], dtype=np.int32) ).unsqueeze(0) From b4a1d3d8b99993f66b6b0ac946a7b66ae7e68b3c Mon Sep 17 00:00:00 2001 From: zyingt Date: Sun, 4 Feb 2024 15:45:04 +0800 Subject: [PATCH 04/13] Multi-speaker VITS support --- egs/tts/VITS/exp_config.json | 6 +++--- models/tts/base/tts_dataset.py | 2 +- models/tts/vits/vits_dataset.py | 5 +---- models/tts/vits/vits_inference.py | 2 +- preprocessors/hifitts.py | 2 +- 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json index a385a7c4..d030aac9 100644 --- a/egs/tts/VITS/exp_config.json +++ b/egs/tts/VITS/exp_config.json @@ -20,13 +20,13 @@ "processed_dir": "data", "sample_rate": 22050, // target sampling rate "valid_file": "valid.json", // validation set - //"use_spkid": true //true: use speaker id for multi-speaker dataset + //"use_spkid": true // true: use speaker ID for multi-speaker dataset }, "model":{ - //"n_speakers": 10 //Specify number of speakers for multi-speaker dataset + //"n_speakers": 10 // specify number of speakers for multi-speaker dataset }, "train": { "batch_size": 16, - //"multi_speaker_training": true //Set to true for multi-speaker training + //"multi_speaker_training": true //true: enable multi-speaker training } } \ No newline at end of file diff --git a/models/tts/base/tts_dataset.py b/models/tts/base/tts_dataset.py index fc85afb9..0142b160 100644 --- a/models/tts/base/tts_dataset.py +++ b/models/tts/base/tts_dataset.py @@ -209,7 +209,7 @@ def __init__(self, cfg, dataset, is_valid=False): phon_id_collator = phoneIDCollation(cfg, dataset=dataset) sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq) - if cfg.preprocess.add_blank: + if cfg.preprocess.add_blank and cfg.train.multi_speaker_training: sequence = intersperse(sequence, 0) self.utt2seq[utt] = sequence diff --git a/models/tts/vits/vits_dataset.py b/models/tts/vits/vits_dataset.py index 291fcb55..e3a1444b 100644 --- a/models/tts/vits/vits_dataset.py +++ b/models/tts/vits/vits_dataset.py @@ -62,11 +62,8 @@ def __call__(self, batch): class VITSTestDataset(TTSTestDataset): def __init__(self, args, cfg): super().__init__(args, cfg) - + processed_data_dir = os.path.join(cfg.preprocess.processed_dir, args.dataset) if cfg.preprocess.use_spkid: - processed_data_dir = os.path.join( - cfg.preprocess.processed_dir, args.dataset - ) spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id) with open(spk2id_path, "r") as f: self.spk2id = json.load(f) diff --git a/models/tts/vits/vits_inference.py b/models/tts/vits/vits_inference.py index 5e28858a..b0a45ebc 100644 --- a/models/tts/vits/vits_inference.py +++ b/models/tts/vits/vits_inference.py @@ -121,7 +121,7 @@ def inference_for_single_utterance( ) phone_id_seq = phon_id_collator.get_phone_id_sequence(self.cfg, phone_seq) - if self.cfg.preprocess.add_blank: + if self.cfg.preprocess.add_blank and self.cfg.train.multi_speaker_training: 
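            # Note: intersperse() pads a blank token id (0) before, between, and after
            # every phone id, e.g. [5, 9, 12] -> [0, 5, 0, 9, 0, 12, 0], so inference
            # matches the add_blank convention applied to the training data.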
phone_id_seq = intersperse(phone_id_seq, 0) # convert phone sequence to phone id sequence diff --git a/preprocessors/hifitts.py b/preprocessors/hifitts.py index bb5b2317..78b069ca 100644 --- a/preprocessors/hifitts.py +++ b/preprocessors/hifitts.py @@ -60,7 +60,7 @@ def main(output_path, dataset_path): entry = json.loads(line) utt_path = entry.get("audio_filepath") chosen_book = utt_path.split("/")[-2] - chosen_uid = utt_path.split("/")[-1] + chosen_uid = utt_path.split("/")[-1].split(".")[0] duration = entry.get("duration") text = entry.get("text_normalized") path = os.path.join(hifitts_path, utt_path) From 8ef137c2ae69cbddfc406dca51e113a4062836c1 Mon Sep 17 00:00:00 2001 From: zyingt Date: Thu, 8 Feb 2024 14:01:00 +0800 Subject: [PATCH 05/13] Merge README.md, added function comment --- bins/tts/preprocess.py | 4 +- egs/tts/VITS/README.md | 215 ++++++---------------------------- processors/phone_extractor.py | 7 +- utils/data_utils.py | 1 + 4 files changed, 44 insertions(+), 183 deletions(-) diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py index 39e955c8..914c0b44 100644 --- a/bins/tts/preprocess.py +++ b/bins/tts/preprocess.py @@ -88,11 +88,11 @@ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types): dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) with open(dataset_file, "r") as f: metadata.extend(json.load(f)) - phone_extractor.extract_utt_phone_sequence(cfg, metadata) + phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata) def preprocess(cfg, args): - """Proprocess raw data of single or multiple datasets (in cfg.dataset) + """Preprocess raw data of single or multiple datasets (in cfg.dataset) Args: cfg (dict): dictionary that stores configurations diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index e6229238..027c8a23 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -3,8 +3,8 @@ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech) -In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. The detailed instructions for training [single speaker](#single-speaker-vits) and [multi-speaker](#multi-speaker-vits) VITS can be found below: -## Single Speaker VITS +In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. + There are four stages in total: 1. Data preparation @@ -20,7 +20,7 @@ There are four stages in total: ## 1. Data Preparation ### Dataset Download -You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech to train single-speaker TTS model for the first time. While for training multi-speaker TTS model for the first time, we would recommend using Hi-Fi TTS. 
The process of downloading dataset has been detailed [here](../../datasets/README.md). ### Configuration @@ -29,10 +29,12 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ```json "dataset": [ "LJSpeech", + //"hifitts" ], "dataset_path": { // TODO: Fill in your dataset path "LJSpeech": "[LJSpeech dataset path]", + //"hifitts": "[Hi-Fi TTS dataset path] }, ``` @@ -40,15 +42,22 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ### Configuration -Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: +In `exp_config.json`:
Specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data. For preprocessing multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: ```json // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" "log_dir": "ckpts/tts", "preprocess": { + //"extract_audio": true,//set to true for multi-speaker TTS model + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", - ... + "sample_rate": 22050, //target sampling rate + "valid_file": "valid.json", //validation set + //"use_spkid": true, //set to true for multi-speaker TTS model }, ``` @@ -67,11 +76,16 @@ sh egs/tts/VITS/run.sh --stage 1 ### Configuration We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. +For training multi-speaker TTS model, specify the `n_speakers` according to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. -``` -"train": { - "batch_size": 16, - } +```json + "model": { + //"n_speakers": 10 //for multi-speaker TTS model: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. + }, + "train": { + "batch_size": 16, + //"multi_speaker_training": true, //for multi-speaker TTS model + } ``` ### Train From Scratch @@ -139,11 +153,13 @@ For inference, you need to specify the following configurations when running `ru | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | -| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.
For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_speaker_name` | The target speaker's voice to be synthesized.
(***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | ### Run +#### Batch inference: For example, if you want to generate speech of all testing set split from LJSpeech, just run: ```bash @@ -154,185 +170,28 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_dataset "LJSpeech" \ --infer_testing_set "test" ``` - -Or, if you want to generate a single clip of speech from a given text, just run: - +The same procedure follows for inferencing on multi-speaker dataset, with ```LJSpeech``` replaced by ```hifitts```. ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "single" \ - --infer_text "This is a clip of generated speech with the given text from a TTS model." -``` - -We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. - -## Multi-speaker VITS -There are four stages in total: - -1. Data preparation -2. Features extraction -3. Training -4. Inference - -> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: -> ```bash -> cd Amphion -> ``` - -## 1. Data Preparation - -### Dataset Download -You can use the commonly used multi-speaker TTS dataset to train TTS model, i.e., Hi-Fi TTS, LibriTTS etc. We strongly recommend you use Hi-Fi TTS to train TTS model for the first time. The process of downloading dataset is detailed [here](../../datasets/README.md). - -### Configuration - -After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. - -```json - "dataset": [ - "hifitts", - ], - "dataset_path": { - // TODO: Fill in your dataset path - "hifitts": "[Hi-Fi TTS dataset path]", - }, -``` - -## 2. Features Extraction - -### Configuration - -In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data, set `extract_audio` and `use_spkid` to `true`. - -```json - // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" - "log_dir": "ckpts/tts", - "preprocess": { - "extract_audio": true, - "use_phone": true, - // linguistic features - "extract_phone": true, - "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" - // TODO: Fill in the output data path. The default value is "Amphion/data" - "processed_dir": "data", - "sample_rate": 24000, //target sampling rate - "valid_file": "valid.json", //validation set - "use_spkid": true, //true: use speaker id for multi-speaker dataset - }, -``` - -### Run - -Run the `run.sh` as the preprocess stage (set `--stage 1`): - -```bash -sh egs/tts/VITS/run.sh --stage 1 -``` - -> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. - -## 3. Training - -### Configuration - -We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. Remember to specify the `n_speakers` according to the number of speakers in your dataset and set `multi_speaker_training` to `true`. 
- -```json - "model": { - // TODO: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. - "n_speakers": 10 - }, - "train": { - "batch_size": 16, - "multi_speaker_training": true, - } -``` - -### Train From Scratch - -Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] -``` - -### Train From Existing Source - -We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. - -Setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run: - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true -``` - -You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run: - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true - --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ -``` - -If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run: - - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true - --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ - --resume_type "finetune" + --infer_mode "batch" \ + --infer_dataset "hifitts" \ + --infer_testing_set "test" ``` -> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. -> -> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. - -Here are some example scenarios to better understand how to use these arguments: -| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` | -| ------ | -------- | ----------------------- | ------------- | -| You want to train from scratch | no | no | no | -| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no | -| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no | -| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` | - - -> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. - - -## 4. 
Inference - -### Configuration - -For inference, you need to specify the following configurations when running `run.sh`: - - -| Parameters | Description | Example | -| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | -| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | -| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | -| `--infer_dataset` | The dataset used for inference. | For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test | For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | -| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | -| `--infer_speaker_name` | The target speaker's voice to be synthesized. | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | - -### Run -For example, if you want to generate speech from all testing set split from Hi-Fi TTS, just run: +#### Single text inference: +For single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "batch" \ - --infer_dataset "hifitts" \ - --infer_testing_set "test" + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." ``` -Or, if you want to generate a single clip of speech from a given text, just run: - +For multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ @@ -342,7 +201,7 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_speaker_name "hifitts_92" ``` -We will release a pre-trained multi-speaker VITS model trained on Hi-Fi TTS soon. Stay tuned! +We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. Meanwhile, the pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned. ```bibtex diff --git a/processors/phone_extractor.py b/processors/phone_extractor.py index 26b59d8b..45a832dc 100644 --- a/processors/phone_extractor.py +++ b/processors/phone_extractor.py @@ -45,7 +45,7 @@ def __init__(self, cfg, dataset_name=None, phone_symbol_file=None): assert cfg.preprocess.lexicon_path != "" self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path) else: - print("No suppert to", cfg.preprocess.phone_extractor) + print("No support to", cfg.preprocess.phone_extractor) raise def extract_phone(self, text): @@ -93,16 +93,17 @@ def save_dataset_phone_symbols_to_table(self): phone_symbol_dict.to_file(self.phone_symbols_file) -def extract_utt_phone_sequence(cfg, metadata): +def extract_utt_phone_sequence(dataset, cfg, metadata): """ Extract phone sequence from text Args: + dataset (str): name of dataset, e.g. opencpop cfg: config metadata: list of dict, each dict contains "Uid", "Text" """ - dataset_name = cfg.dataset[0] + dataset_name = dataset # output path out_path = os.path.join( diff --git a/utils/data_utils.py b/utils/data_utils.py index e8762053..ea9b271b 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -13,6 +13,7 @@ def intersperse(lst, item): + # Insert an item in between any consecutive elements of the given list, including beginning and end of list result = [item] * (len(lst) * 2 + 1) result[1::2] = lst return result From 8455a5f0834e30510a23cf1d8c3eb6cc478fcb28 Mon Sep 17 00:00:00 2001 From: zyingt Date: Thu, 8 Feb 2024 14:09:05 +0800 Subject: [PATCH 06/13] Revert "Merge README.md, added function comment" This reverts commit 8ef137c2ae69cbddfc406dca51e113a4062836c1. 
--- bins/tts/preprocess.py | 4 +- egs/tts/VITS/README.md | 215 ++++++++++++++++++++++++++++------ processors/phone_extractor.py | 7 +- utils/data_utils.py | 1 - 4 files changed, 183 insertions(+), 44 deletions(-) diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py index 914c0b44..39e955c8 100644 --- a/bins/tts/preprocess.py +++ b/bins/tts/preprocess.py @@ -88,11 +88,11 @@ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types): dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) with open(dataset_file, "r") as f: metadata.extend(json.load(f)) - phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata) + phone_extractor.extract_utt_phone_sequence(cfg, metadata) def preprocess(cfg, args): - """Preprocess raw data of single or multiple datasets (in cfg.dataset) + """Proprocess raw data of single or multiple datasets (in cfg.dataset) Args: cfg (dict): dictionary that stores configurations diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index 027c8a23..e6229238 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -3,8 +3,8 @@ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech) -In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. - +In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. The detailed instructions for training [single speaker](#single-speaker-vits) and [multi-speaker](#multi-speaker-vits) VITS can be found below: +## Single Speaker VITS There are four stages in total: 1. Data preparation @@ -20,7 +20,7 @@ There are four stages in total: ## 1. Data Preparation ### Dataset Download -You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech to train single-speaker TTS model for the first time. While for training multi-speaker TTS model for the first time, we would recommend using Hi-Fi TTS. The process of downloading dataset has been detailed [here](../../datasets/README.md). +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). ### Configuration @@ -29,12 +29,10 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ```json "dataset": [ "LJSpeech", - //"hifitts" ], "dataset_path": { // TODO: Fill in your dataset path "LJSpeech": "[LJSpeech dataset path]", - //"hifitts": "[Hi-Fi TTS dataset path] }, ``` @@ -42,22 +40,15 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ### Configuration -In `exp_config.json`:
Specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data. For preprocessing multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: ```json // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" "log_dir": "ckpts/tts", "preprocess": { - //"extract_audio": true,//set to true for multi-speaker TTS model - "use_phone": true, - // linguistic features - "extract_phone": true, - "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", - "sample_rate": 22050, //target sampling rate - "valid_file": "valid.json", //validation set - //"use_spkid": true, //set to true for multi-speaker TTS model + ... }, ``` @@ -76,16 +67,11 @@ sh egs/tts/VITS/run.sh --stage 1 ### Configuration We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. -For training multi-speaker TTS model, specify the `n_speakers` according to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. -```json - "model": { - //"n_speakers": 10 //for multi-speaker TTS model: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. - }, - "train": { - "batch_size": 16, - //"multi_speaker_training": true, //for multi-speaker TTS model - } +``` +"train": { + "batch_size": 16, + } ``` ### Train From Scratch @@ -153,13 +139,11 @@ For inference, you need to specify the following configurations when running `ru | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | -| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.
For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | -| `--infer_speaker_name` | The target speaker's voice to be synthesized.
(***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | ### Run -#### Batch inference: For example, if you want to generate speech of all testing set split from LJSpeech, just run: ```bash @@ -170,28 +154,185 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_dataset "LJSpeech" \ --infer_testing_set "test" ``` -The same procedure follows for inferencing on multi-speaker dataset, with ```LJSpeech``` replaced by ```hifitts```. + +Or, if you want to generate a single clip of speech from a given text, just run: + ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "batch" \ - --infer_dataset "hifitts" \ - --infer_testing_set "test" + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." +``` + +We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. + +## Multi-speaker VITS +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used multi-speaker TTS dataset to train TTS model, i.e., Hi-Fi TTS, LibriTTS etc. We strongly recommend you use Hi-Fi TTS to train TTS model for the first time. The process of downloading dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "hifitts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "hifitts": "[Hi-Fi TTS dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data, set `extract_audio` and `use_spkid` to `true`. + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + "extract_audio": true, + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 24000, //target sampling rate + "valid_file": "valid.json", //validation set + "use_spkid": true, //true: use speaker id for multi-speaker dataset + }, +``` + +### Run + +Run the `run.sh` as the preprocess stage (set `--stage 1`): + +```bash +sh egs/tts/VITS/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. 
Remember to specify the `n_speakers` according to the number of speakers in your dataset and set `multi_speaker_training` to `true`. + +```json + "model": { + // TODO: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. + "n_speakers": 10 + }, + "train": { + "batch_size": 16, + "multi_speaker_training": true, + } +``` + +### Train From Scratch + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] +``` + +### Train From Existing Source + +We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. + +Setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run: + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true +``` + +You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run: + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ +``` + +If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run: + + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ + --resume_type "finetune" ``` -#### Single text inference: -For single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: +> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. +> +> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. + +Here are some example scenarios to better understand how to use these arguments: +| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` | +| ------ | -------- | ----------------------- | ------------- | +| You want to train from scratch | no | no | no | +| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no | +| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no | +| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` | + + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. 
Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + +| Parameters | Description | Example | +| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test | For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_speaker_name` | The target speaker's voice to be synthesized. | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | + +### Run +For example, if you want to generate speech from all testing set split from Hi-Fi TTS, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "single" \ - --infer_text "This is a clip of generated speech with the given text from a TTS model." + --infer_mode "batch" \ + --infer_dataset "hifitts" \ + --infer_testing_set "test" ``` -For multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: +Or, if you want to generate a single clip of speech from a given text, just run: + ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ @@ -201,7 +342,7 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_speaker_name "hifitts_92" ``` -We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. Meanwhile, the pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned. +We will release a pre-trained multi-speaker VITS model trained on Hi-Fi TTS soon. Stay tuned! ```bibtex diff --git a/processors/phone_extractor.py b/processors/phone_extractor.py index 45a832dc..26b59d8b 100644 --- a/processors/phone_extractor.py +++ b/processors/phone_extractor.py @@ -45,7 +45,7 @@ def __init__(self, cfg, dataset_name=None, phone_symbol_file=None): assert cfg.preprocess.lexicon_path != "" self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path) else: - print("No support to", cfg.preprocess.phone_extractor) + print("No suppert to", cfg.preprocess.phone_extractor) raise def extract_phone(self, text): @@ -93,17 +93,16 @@ def save_dataset_phone_symbols_to_table(self): phone_symbol_dict.to_file(self.phone_symbols_file) -def extract_utt_phone_sequence(dataset, cfg, metadata): +def extract_utt_phone_sequence(cfg, metadata): """ Extract phone sequence from text Args: - dataset (str): name of dataset, e.g. 
opencpop cfg: config metadata: list of dict, each dict contains "Uid", "Text" """ - dataset_name = dataset + dataset_name = cfg.dataset[0] # output path out_path = os.path.join( diff --git a/utils/data_utils.py b/utils/data_utils.py index ea9b271b..e8762053 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -13,7 +13,6 @@ def intersperse(lst, item): - # Insert an item in between any consecutive elements of the given list, including beginning and end of list result = [item] * (len(lst) * 2 + 1) result[1::2] = lst return result From dd796ec40f276260aa92d34b039f0b65e7013c0d Mon Sep 17 00:00:00 2001 From: zyingt Date: Thu, 8 Feb 2024 14:25:11 +0800 Subject: [PATCH 07/13] Merged README.md, updated comments --- bins/tts/preprocess.py | 2 +- egs/tts/VITS/README.md | 226 +++++++--------------------------- egs/tts/VITS/exp_config.json | 14 ++- processors/phone_extractor.py | 5 +- utils/data_utils.py | 1 + 5 files changed, 56 insertions(+), 192 deletions(-) diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py index 39e955c8..1b57b677 100644 --- a/bins/tts/preprocess.py +++ b/bins/tts/preprocess.py @@ -88,7 +88,7 @@ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types): dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) with open(dataset_file, "r") as f: metadata.extend(json.load(f)) - phone_extractor.extract_utt_phone_sequence(cfg, metadata) + phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata) def preprocess(cfg, args): diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index e6229238..0225442a 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -3,8 +3,8 @@ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech) -In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. The detailed instructions for training [single speaker](#single-speaker-vits) and [multi-speaker](#multi-speaker-vits) VITS can be found below: -## Single Speaker VITS +In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. + There are four stages in total: 1. Data preparation @@ -20,7 +20,7 @@ There are four stages in total: ## 1. Data Preparation ### Dataset Download -You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech to train single-speaker TTS model for the first time. While for training multi-speaker TTS model for the first time, we would recommend using Hi-Fi TTS. The process of downloading dataset has been detailed [here](../../datasets/README.md). 
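The next step points the recipe at the downloaded corpus through the `dataset` and `dataset_path` fields of `exp_config.json`, so it is worth confirming that the path actually resolves before any preprocessing is launched. The snippet below is only an illustrative sketch of such a check; the two variables simply mirror those config fields and the directory shown is a placeholder, not part of the Amphion tooling.

```python
import os

# Mirror of the "dataset" / "dataset_path" fields in exp_config.json
# (the path below is a placeholder, not a real location).
dataset = ["hifitts"]
dataset_path = {"hifitts": "/path/to/hi_fi_tts"}

for name in dataset:
    path = dataset_path.get(name)
    if path is None:
        print(f"[warn] dataset_path has no entry for {name!r}")
    elif not os.path.isdir(path):
        print(f"[warn] {name}: directory not found at {path}")
    else:
        print(f"[ok] {name}: {path}")
```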
### Configuration @@ -29,10 +29,12 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ```json "dataset": [ "LJSpeech", + //"hifitts" ], "dataset_path": { // TODO: Fill in your dataset path "LJSpeech": "[LJSpeech dataset path]", + //"hifitts": "[Hi-Fi TTS dataset path] }, ``` @@ -40,15 +42,22 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ### Configuration -Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: +In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data. For preprocessing multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: ```json // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" "log_dir": "ckpts/tts", "preprocess": { + //"extract_audio": true, + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", - ... + "sample_rate": 22050, //target sampling rate + "valid_file": "valid.json", //validation set + //"use_spkid": true, //use speaker ID to train multi-speaker TTS model }, ``` @@ -67,11 +76,16 @@ sh egs/tts/VITS/run.sh --stage 1 ### Configuration We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. +For training multi-speaker TTS model, specify the `n_speakers` according to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. -``` -"train": { - "batch_size": 16, - } +```json + "model": { + //"n_speakers": 10 //Number of speakers in the dataset(s) used. The default value is 0 if not specified. + }, + "train": { + "batch_size": 16, + //"multi_speaker_training": true, + } ``` ### Train From Scratch @@ -139,210 +153,56 @@ For inference, you need to specify the following configurations when running `ru | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | -| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.
For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_speaker_name` | The target speaker's voice to be synthesized.
(***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | ### Run -For example, if you want to generate speech of all testing set split from LJSpeech, just run: +#### Single text inference: +For single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "batch" \ - --infer_dataset "LJSpeech" \ - --infer_testing_set "test" + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." ``` -Or, if you want to generate a single clip of speech from a given text, just run: - +For multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ --infer_mode "single" \ - --infer_text "This is a clip of generated speech with the given text from a TTS model." -``` - -We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. - -## Multi-speaker VITS -There are four stages in total: - -1. Data preparation -2. Features extraction -3. Training -4. Inference - -> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: -> ```bash -> cd Amphion -> ``` - -## 1. Data Preparation - -### Dataset Download -You can use the commonly used multi-speaker TTS dataset to train TTS model, i.e., Hi-Fi TTS, LibriTTS etc. We strongly recommend you use Hi-Fi TTS to train TTS model for the first time. The process of downloading dataset is detailed [here](../../datasets/README.md). - -### Configuration - -After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. - -```json - "dataset": [ - "hifitts", - ], - "dataset_path": { - // TODO: Fill in your dataset path - "hifitts": "[Hi-Fi TTS dataset path]", - }, -``` - -## 2. Features Extraction - -### Configuration - -In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data, set `extract_audio` and `use_spkid` to `true`. - -```json - // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" - "log_dir": "ckpts/tts", - "preprocess": { - "extract_audio": true, - "use_phone": true, - // linguistic features - "extract_phone": true, - "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" - // TODO: Fill in the output data path. The default value is "Amphion/data" - "processed_dir": "data", - "sample_rate": 24000, //target sampling rate - "valid_file": "valid.json", //validation set - "use_spkid": true, //true: use speaker id for multi-speaker dataset - }, -``` - -### Run - -Run the `run.sh` as the preprocess stage (set `--stage 1`): - -```bash -sh egs/tts/VITS/run.sh --stage 1 -``` - -> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. - -## 3. 
Training - -### Configuration - -We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. Remember to specify the `n_speakers` according to the number of speakers in your dataset and set `multi_speaker_training` to `true`. - -```json - "model": { - // TODO: Fill in the number of speakers according to dataset used. The default value is 0 if not specified. - "n_speakers": 10 - }, - "train": { - "batch_size": 16, - "multi_speaker_training": true, - } -``` - -### Train From Scratch - -Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] -``` - -### Train From Existing Source - -We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. - -Setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run: - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true -``` - -You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run: - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true - --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ -``` - -If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run: - - -```bash -sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ - --resume true - --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ - --resume_type "finetune" + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --infer_speaker_name "hifitts_92" ``` -> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. -> -> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. - -Here are some example scenarios to better understand how to use these arguments: -| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` | -| ------ | -------- | ----------------------- | ------------- | -| You want to train from scratch | no | no | no | -| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no | -| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no | -| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` | - - -> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. 
You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. - - -## 4. Inference - -### Configuration - -For inference, you need to specify the following configurations when running `run.sh`: - - -| Parameters | Description | Example | -| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | -| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | -| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | -| `--infer_dataset` | The dataset used for inference. | For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test | For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | -| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | -| `--infer_speaker_name` | The target speaker's voice to be synthesized. | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | - -### Run -For example, if you want to generate speech from all testing set split from Hi-Fi TTS, just run: +#### Batch inference: +For single-speaker TTS model, if you want to generate speech of all testing set split from LJSpeech, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ --infer_mode "batch" \ - --infer_dataset "hifitts" \ - --infer_testing_set "test" + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" ``` - -Or, if you want to generate a single clip of speech from a given text, just run: - +For multi-speaker TTS model, if you want to generate speech of all testing set split from Hi-Fi TTS, the same procedure follows from above, with ```LJSpeech``` replaced by ```hifitts```. ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ - --infer_mode "single" \ - --infer_text "This is a clip of generated speech with the given text from a TTS model." \ - --infer_speaker_name "hifitts_92" + --infer_mode "batch" \ + --infer_dataset "hifitts" \ + --infer_testing_set "test" ``` -We will release a pre-trained multi-speaker VITS model trained on Hi-Fi TTS soon. Stay tuned! + +We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. Meanwhile, the pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned. ```bibtex diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json index d030aac9..721412f2 100644 --- a/egs/tts/VITS/exp_config.json +++ b/egs/tts/VITS/exp_config.json @@ -2,11 +2,13 @@ "base_config": "config/vits.json", "model_type": "VITS", "dataset": [ - "LJSpeech" + "LJSpeech", + //"hifitts" ], "dataset_path": { // TODO: Fill in your dataset path - "LJSpeech": "[LJSpeech dataset path]" + "LJSpeech": "[LJSpeech dataset path]", + //"hifitts": "[Hi-Fi TTS dataset path] }, // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" "log_dir": "ckpts/tts", @@ -15,18 +17,18 @@ "use_phone": true, // linguistic features "extract_phone": true, - "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", "sample_rate": 22050, // target sampling rate "valid_file": "valid.json", // validation set - //"use_spkid": true // true: use speaker ID for multi-speaker dataset + //"use_spkid": true // use speaker ID to train multi-speaker TTS model }, "model":{ - //"n_speakers": 10 // specify number of speakers for multi-speaker dataset + //"n_speakers": 10 // number of speakers in the dataset(s) used. The default value is 0 if not specified. 
}, "train": { "batch_size": 16, - //"multi_speaker_training": true //true: enable multi-speaker training + //"multi_speaker_training": true } } \ No newline at end of file diff --git a/processors/phone_extractor.py b/processors/phone_extractor.py index 26b59d8b..37959e1f 100644 --- a/processors/phone_extractor.py +++ b/processors/phone_extractor.py @@ -93,16 +93,17 @@ def save_dataset_phone_symbols_to_table(self): phone_symbol_dict.to_file(self.phone_symbols_file) -def extract_utt_phone_sequence(cfg, metadata): +def extract_utt_phone_sequence(dataset, cfg, metadata): """ Extract phone sequence from text Args: + dataset (str): name of dataset, e.g. opencpop cfg: config metadata: list of dict, each dict contains "Uid", "Text" """ - dataset_name = cfg.dataset[0] + dataset_name = dataset # output path out_path = os.path.join( diff --git a/utils/data_utils.py b/utils/data_utils.py index e8762053..ea9b271b 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -13,6 +13,7 @@ def intersperse(lst, item): + # Insert an item in between any consecutive elements of the given list, including beginning and end of list result = [item] * (len(lst) * 2 + 1) result[1::2] = lst return result From 8c910d76ff71de59a96bde659813401cf035f69e Mon Sep 17 00:00:00 2001 From: zyingt Date: Thu, 8 Feb 2024 15:49:45 +0800 Subject: [PATCH 08/13] Fixed typos --- bins/tts/preprocess.py | 2 +- processors/phone_extractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py index 1b57b677..914c0b44 100644 --- a/bins/tts/preprocess.py +++ b/bins/tts/preprocess.py @@ -92,7 +92,7 @@ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types): def preprocess(cfg, args): - """Proprocess raw data of single or multiple datasets (in cfg.dataset) + """Preprocess raw data of single or multiple datasets (in cfg.dataset) Args: cfg (dict): dictionary that stores configurations diff --git a/processors/phone_extractor.py b/processors/phone_extractor.py index 37959e1f..45a832dc 100644 --- a/processors/phone_extractor.py +++ b/processors/phone_extractor.py @@ -45,7 +45,7 @@ def __init__(self, cfg, dataset_name=None, phone_symbol_file=None): assert cfg.preprocess.lexicon_path != "" self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path) else: - print("No suppert to", cfg.preprocess.phone_extractor) + print("No support to", cfg.preprocess.phone_extractor) raise def extract_phone(self, text): From a2735050366771054f34a618e1bdb61aacff7846 Mon Sep 17 00:00:00 2001 From: zyingt Date: Mon, 19 Feb 2024 15:09:15 +0800 Subject: [PATCH 09/13] Enabling intersperse function for single-speaker VITS, added example usage, black format --- evaluation/metrics/similarity/models/RawNetModel.py | 4 +++- models/tta/autoencoder/autoencoder.py | 6 ++++-- models/tta/autoencoder/autoencoder_dataset.py | 6 ++++-- models/tta/ldm/audioldm_dataset.py | 6 ++++-- models/tts/base/tts_dataset.py | 2 +- models/tts/vits/vits_inference.py | 2 +- modules/diffusion/karras/karras_diffusion.py | 4 +++- modules/transformer/mh_attention.py | 4 +++- optimizer/optimizers.py | 4 +++- utils/data_utils.py | 8 +++++++- 10 files changed, 33 insertions(+), 13 deletions(-) diff --git a/evaluation/metrics/similarity/models/RawNetModel.py b/evaluation/metrics/similarity/models/RawNetModel.py index cfe8a555..3e687cf7 100755 --- a/evaluation/metrics/similarity/models/RawNetModel.py +++ b/evaluation/metrics/similarity/models/RawNetModel.py @@ -121,7 +121,9 @@ def forward(self, x): w = self.attention(global_x) 
mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)) + sg = torch.sqrt( + (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) + ) x = torch.cat((mu, sg), 1) diff --git a/models/tta/autoencoder/autoencoder.py b/models/tta/autoencoder/autoencoder.py index fec67718..d97bfdc9 100644 --- a/models/tta/autoencoder/autoencoder.py +++ b/models/tta/autoencoder/autoencoder.py @@ -250,7 +250,8 @@ def forward(self, x): # TODO: Encoder1d -class Encoder1d(Encoder2d): ... +class Encoder1d(Encoder2d): + ... class Decoder2d(nn.Module): @@ -350,7 +351,8 @@ def forward(self, z): # TODO: decoder1d -class Decoder1d(Decoder2d): ... +class Decoder1d(Decoder2d): + ... class AutoencoderKL(nn.Module): diff --git a/models/tta/autoencoder/autoencoder_dataset.py b/models/tta/autoencoder/autoencoder_dataset.py index 764008c8..75828eea 100644 --- a/models/tta/autoencoder/autoencoder_dataset.py +++ b/models/tta/autoencoder/autoencoder_dataset.py @@ -106,7 +106,9 @@ def __call__(self, batch): return packed_batch_features -class AutoencoderKLTestDataset(BaseTestDataset): ... +class AutoencoderKLTestDataset(BaseTestDataset): + ... -class AutoencoderKLTestCollator(BaseTestCollator): ... +class AutoencoderKLTestCollator(BaseTestCollator): + ... diff --git a/models/tta/ldm/audioldm_dataset.py b/models/tta/ldm/audioldm_dataset.py index 2bb176b9..344eeb51 100644 --- a/models/tta/ldm/audioldm_dataset.py +++ b/models/tta/ldm/audioldm_dataset.py @@ -145,7 +145,9 @@ def __call__(self, batch): return packed_batch_features -class AudioLDMTestDataset(BaseTestDataset): ... +class AudioLDMTestDataset(BaseTestDataset): + ... -class AudioLDMTestCollator(BaseTestCollator): ... +class AudioLDMTestCollator(BaseTestCollator): + ... diff --git a/models/tts/base/tts_dataset.py b/models/tts/base/tts_dataset.py index 0142b160..fc85afb9 100644 --- a/models/tts/base/tts_dataset.py +++ b/models/tts/base/tts_dataset.py @@ -209,7 +209,7 @@ def __init__(self, cfg, dataset, is_valid=False): phon_id_collator = phoneIDCollation(cfg, dataset=dataset) sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq) - if cfg.preprocess.add_blank and cfg.train.multi_speaker_training: + if cfg.preprocess.add_blank: sequence = intersperse(sequence, 0) self.utt2seq[utt] = sequence diff --git a/models/tts/vits/vits_inference.py b/models/tts/vits/vits_inference.py index b0a45ebc..5e28858a 100644 --- a/models/tts/vits/vits_inference.py +++ b/models/tts/vits/vits_inference.py @@ -121,7 +121,7 @@ def inference_for_single_utterance( ) phone_id_seq = phon_id_collator.get_phone_id_sequence(self.cfg, phone_seq) - if self.cfg.preprocess.add_blank and self.cfg.train.multi_speaker_training: + if self.cfg.preprocess.add_blank: phone_id_seq = intersperse(phone_id_seq, 0) # convert phone sequence to phone id sequence diff --git a/modules/diffusion/karras/karras_diffusion.py b/modules/diffusion/karras/karras_diffusion.py index 7e366496..f590c25a 100644 --- a/modules/diffusion/karras/karras_diffusion.py +++ b/modules/diffusion/karras/karras_diffusion.py @@ -465,7 +465,9 @@ def to_d(x, sigma, denoised): def get_ancestral_step(sigma_from, sigma_to): """Calculates the noise level (sigma_down) to step down to and the amount of noise to add (sigma_up) when doing an ancestral sampling step.""" - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_up = ( + sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2 + ) ** 0.5 sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 return 
sigma_down, sigma_up diff --git a/modules/transformer/mh_attention.py b/modules/transformer/mh_attention.py index bf576ca7..b5f0f730 100644 --- a/modules/transformer/mh_attention.py +++ b/modules/transformer/mh_attention.py @@ -344,7 +344,9 @@ def forward( ( 1 if key_padding_mask is not None - else 0 if attn_mask is not None else None + else 0 + if attn_mask is not None + else None ), ) diff --git a/optimizer/optimizers.py b/optimizer/optimizers.py index 94260aee..180689e0 100644 --- a/optimizer/optimizers.py +++ b/optimizer/optimizers.py @@ -697,7 +697,9 @@ def _size_update( denom = scale_exp_avg_sq.sqrt() + eps - scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom + scale_step = ( + -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom + ) is_too_small = param_rms < param_min_rms is_too_large = param_rms > param_max_rms diff --git a/utils/data_utils.py b/utils/data_utils.py index ea9b271b..8c0bc2ff 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -13,7 +13,13 @@ def intersperse(lst, item): - # Insert an item in between any consecutive elements of the given list, including beginning and end of list + """ + Insert an item in between any two consecutive elements of the given list, including beginning and end of list + + Example: + >>> intersperse(0, [1, 74, 5, 31]) + [0, 1, 0, 74, 0, 5, 0, 31, 0] + """ result = [item] * (len(lst) * 2 + 1) result[1::2] = lst return result From ab84579a9ba923fdc6b1d3e2d278497f843096bf Mon Sep 17 00:00:00 2001 From: zyingt Date: Wed, 21 Feb 2024 15:14:41 +0800 Subject: [PATCH 10/13] enhance model loading compatibility --- models/tts/base/tts_inferece.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/models/tts/base/tts_inferece.py b/models/tts/base/tts_inferece.py index cb09a4d4..f49ace0f 100644 --- a/models/tts/base/tts_inferece.py +++ b/models/tts/base/tts_inferece.py @@ -12,6 +12,7 @@ from tqdm import tqdm from accelerate.logging import get_logger from torch.utils.data import DataLoader +from safetensors.torch import load_file from abc import abstractmethod @@ -162,7 +163,16 @@ def _load_model( ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True) checkpoint_path = ls[0] - self.accelerator.load_state(str(checkpoint_path)) + if ( + Path(os.path.join(checkpoint_path, "model.safetensors")).exists() + and accelerate.__version__ < "0.25" + ): + self.model.load_state_dict( + load_file(os.path.join(checkpoint_path, "model.safetensors")), + strict=False, + ) + else: + self.accelerator.load_state(str(checkpoint_path)) return str(checkpoint_path) def inference(self): From 9a743bed2d50412c6715197abec05ae12d3ed2b5 Mon Sep 17 00:00:00 2001 From: zyingt Date: Thu, 22 Feb 2024 15:01:14 +0800 Subject: [PATCH 11/13] black format --- evaluation/metrics/similarity/models/RawNetModel.py | 4 +--- models/tta/autoencoder/autoencoder.py | 6 ++---- models/tta/autoencoder/autoencoder_dataset.py | 6 ++---- models/tta/ldm/audioldm_dataset.py | 6 ++---- modules/diffusion/karras/karras_diffusion.py | 4 +--- modules/transformer/mh_attention.py | 4 +--- optimizer/optimizers.py | 4 +--- 7 files changed, 10 insertions(+), 24 deletions(-) diff --git a/evaluation/metrics/similarity/models/RawNetModel.py b/evaluation/metrics/similarity/models/RawNetModel.py index 3e687cf7..cfe8a555 100755 --- a/evaluation/metrics/similarity/models/RawNetModel.py +++ b/evaluation/metrics/similarity/models/RawNetModel.py @@ -121,9 +121,7 @@ def forward(self, x): w = self.attention(global_x) mu = 
torch.sum(x * w, dim=2) - sg = torch.sqrt( - (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4) - ) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)) x = torch.cat((mu, sg), 1) diff --git a/models/tta/autoencoder/autoencoder.py b/models/tta/autoencoder/autoencoder.py index d97bfdc9..fec67718 100644 --- a/models/tta/autoencoder/autoencoder.py +++ b/models/tta/autoencoder/autoencoder.py @@ -250,8 +250,7 @@ def forward(self, x): # TODO: Encoder1d -class Encoder1d(Encoder2d): - ... +class Encoder1d(Encoder2d): ... class Decoder2d(nn.Module): @@ -351,8 +350,7 @@ def forward(self, z): # TODO: decoder1d -class Decoder1d(Decoder2d): - ... +class Decoder1d(Decoder2d): ... class AutoencoderKL(nn.Module): diff --git a/models/tta/autoencoder/autoencoder_dataset.py b/models/tta/autoencoder/autoencoder_dataset.py index 75828eea..764008c8 100644 --- a/models/tta/autoencoder/autoencoder_dataset.py +++ b/models/tta/autoencoder/autoencoder_dataset.py @@ -106,9 +106,7 @@ def __call__(self, batch): return packed_batch_features -class AutoencoderKLTestDataset(BaseTestDataset): - ... +class AutoencoderKLTestDataset(BaseTestDataset): ... -class AutoencoderKLTestCollator(BaseTestCollator): - ... +class AutoencoderKLTestCollator(BaseTestCollator): ... diff --git a/models/tta/ldm/audioldm_dataset.py b/models/tta/ldm/audioldm_dataset.py index 344eeb51..2bb176b9 100644 --- a/models/tta/ldm/audioldm_dataset.py +++ b/models/tta/ldm/audioldm_dataset.py @@ -145,9 +145,7 @@ def __call__(self, batch): return packed_batch_features -class AudioLDMTestDataset(BaseTestDataset): - ... +class AudioLDMTestDataset(BaseTestDataset): ... -class AudioLDMTestCollator(BaseTestCollator): - ... +class AudioLDMTestCollator(BaseTestCollator): ... diff --git a/modules/diffusion/karras/karras_diffusion.py b/modules/diffusion/karras/karras_diffusion.py index f590c25a..7e366496 100644 --- a/modules/diffusion/karras/karras_diffusion.py +++ b/modules/diffusion/karras/karras_diffusion.py @@ -465,9 +465,7 @@ def to_d(x, sigma, denoised): def get_ancestral_step(sigma_from, sigma_to): """Calculates the noise level (sigma_down) to step down to and the amount of noise to add (sigma_up) when doing an ancestral sampling step.""" - sigma_up = ( - sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2 - ) ** 0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 return sigma_down, sigma_up diff --git a/modules/transformer/mh_attention.py b/modules/transformer/mh_attention.py index b5f0f730..bf576ca7 100644 --- a/modules/transformer/mh_attention.py +++ b/modules/transformer/mh_attention.py @@ -344,9 +344,7 @@ def forward( ( 1 if key_padding_mask is not None - else 0 - if attn_mask is not None - else None + else 0 if attn_mask is not None else None ), ) diff --git a/optimizer/optimizers.py b/optimizer/optimizers.py index 180689e0..94260aee 100644 --- a/optimizer/optimizers.py +++ b/optimizer/optimizers.py @@ -697,9 +697,7 @@ def _size_update( denom = scale_exp_avg_sq.sqrt() + eps - scale_step = ( - -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom - ) + scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom is_too_small = param_rms < param_min_rms is_too_large = param_rms > param_max_rms From d6c857c22efb9d96b11f6f6ad27ee9c27627ff02 Mon Sep 17 00:00:00 2001 From: Liumeng Xue <33707885+lmxue@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:11:31 +0800 Subject: [PATCH 12/13] Update 
exp_config.json --- egs/tts/VITS/exp_config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json index 721412f2..3a2332f2 100644 --- a/egs/tts/VITS/exp_config.json +++ b/egs/tts/VITS/exp_config.json @@ -25,10 +25,10 @@ //"use_spkid": true // use speaker ID to train multi-speaker TTS model }, "model":{ - //"n_speakers": 10 // number of speakers in the dataset(s) used. The default value is 0 if not specified. + //"n_speakers": 10 // number of speakers, greater than or equal to the number of speakers in the dataset(s) used. The default value is 0 if not specified. }, "train": { "batch_size": 16, //"multi_speaker_training": true } -} \ No newline at end of file +} From 6046ea8eda63ffd8cd36b6c1f549c5c56ebfa27c Mon Sep 17 00:00:00 2001 From: Liumeng Xue <33707885+lmxue@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:21:15 +0800 Subject: [PATCH 13/13] Update README.md Fix typos and revise the explanation for `n_speaker` --- egs/tts/VITS/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md index 0225442a..ff489419 100644 --- a/egs/tts/VITS/README.md +++ b/egs/tts/VITS/README.md @@ -42,7 +42,7 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso ### Configuration -In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, specify the `processed_dir` for saving processed data. For preprocessing multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: +In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, and specify the `processed_dir` for saving processed data. For preprocessing the multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: ```json // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" @@ -63,7 +63,7 @@ In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, ### Run -Run the `run.sh` as the preproces stage (set `--stage 1`): +Run the `run.sh` as the preprocess stage (set `--stage 1`): ```bash sh egs/tts/VITS/run.sh --stage 1 @@ -75,8 +75,8 @@ sh egs/tts/VITS/run.sh --stage 1 ### Configuration -We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. -For training multi-speaker TTS model, specify the `n_speakers` according to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. +We provide the default hyparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines. +For training the multi-speaker TTS model, specify the `n_speakers` value to be greater (used for new speaker fine-tuning) than or equal to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. ```json "model": { @@ -90,7 +90,7 @@ For training multi-speaker TTS model, specify the `n_speakers` according to the ### Train From Scratch -Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. +Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. 
```bash sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] @@ -154,13 +154,13 @@ For inference, you need to specify the following configurations when running `ru | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.
For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | -| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from the test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | -| `--infer_speaker_name` | The target speaker's voice to be synthesized.
(***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. |
+| `--infer_speaker_name` | The target speaker whose voice is to be synthesized.
(***Note: only applicable to the multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | ### Run #### Single text inference: -For single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: +For the single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ @@ -170,7 +170,7 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_text "This is a clip of generated speech with the given text from a TTS model." ``` -For multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: +For the multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ @@ -181,7 +181,7 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ ``` #### Batch inference: -For single-speaker TTS model, if you want to generate speech of all testing set split from LJSpeech, just run: +For the single-speaker TTS model, if you want to generate speech of all testing sets split from LJSpeech, just run: ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ @@ -191,7 +191,7 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_dataset "LJSpeech" \ --infer_testing_set "test" ``` -For multi-speaker TTS model, if you want to generate speech of all testing set split from Hi-Fi TTS, the same procedure follows from above, with ```LJSpeech``` replaced by ```hifitts```. +For the multi-speaker TTS model, if you want to generate speech of all testing sets split from Hi-Fi TTS, the same procedure follows from above, with ```LJSpeech``` replaced by ```hifitts```. ```bash sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ @@ -213,4 +213,4 @@ We released a pre-trained Amphion VITS model trained on LJSpeech. So you can dow pages={5530--5540}, year={2021}, } -``` \ No newline at end of file +```
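One small helper worth keeping in mind when reading the `add_blank` changes in these patches is `intersperse` from `utils/data_utils.py`, which inserts a blank token id between consecutive phone ids and at both ends of the sequence. The following is a reference-only sketch of that helper with a usage example (note the `(lst, item)` argument order of its signature):

```python
def intersperse(lst, item):
    # Insert `item` between any two consecutive elements of `lst`,
    # and also at the beginning and the end of the list.
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


if __name__ == "__main__":
    # A phone-id sequence interleaved with the blank id 0:
    print(intersperse([1, 74, 5, 31], 0))  # [0, 1, 0, 74, 0, 5, 0, 31, 0]
```

With `add_blank` enabled, this is the transformation applied to the phone-id sequence in both the dataset loader and the single-utterance inference path.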