From a2c17086a9b98c0b5d4cf121b0ac10f0f6703f0c Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Fri, 14 Apr 2023 16:29:11 -0700 Subject: [PATCH 01/25] Add tts adapter tutorial Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 970 ++++++++++++++++++ .../FastPitch_MultiSpeaker_Pretraining.ipynb | 868 ++++++++++++++++ 2 files changed, 1838 insertions(+) create mode 100644 tutorials/tts/FastPitch_Adapter_Finetuning.ipynb create mode 100644 tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb new file mode 100644 index 000000000000..1c9a377911af --- /dev/null +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -0,0 +1,970 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bceec759", + "metadata": {}, + "source": [ + "# FastPitch Adapter Finetuning\n", + "\n", + "This notebook is designed to provide a guide on how to run FastPitch Adapter Finetuning Pipeline. It contains the following sections:\n", + "1. **Transform pre-trained FastPitch checkpoint to adapter-compatible checkpoint**\n", + "2. **Fine-tune FastPitch on adaptation data**: fine-tune pre-trained multi-speaker FastPitch for a new speaker\n", + "* Dataset Preparation: download dataset and extract manifest files. (duration more than 15 mins)\n", + "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Training: fine-tune frozen multispeaker FastPitch with trainable adapters.\n", + "3. **Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", + "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", + "* Training: fine-tune HiFiGAN with fine-tuned adaptation data.\n", + "4. **Inference**: generate speech from adpated FastPitch\n", + "* Load Model: load pre-trained multi-speaker FastPitch with fine-tuned adapters.\n", + "* Output Audio: generate audio files." + ] + }, + { + "cell_type": "markdown", + "id": "9363d17a", + "metadata": {}, + "source": [ + "# License\n", + "\n", + "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f952558a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5c4698e", + "metadata": {}, + "outputs": [], + "source": [ + "!wandb login #PASTE_WANDB_APIKEY_HERE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b37d86", + "metadata": {}, + "outputs": [], + "source": [ + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5ec43c", + "metadata": {}, + "outputs": [], + "source": [ + "sample_rate = 44100\n", + "# Store all python script\n", + "codedir = 'NeMoTTS' \n", + "# Store all manifest and audios\n", + "datadir = 'NeMoTTS_dataset'\n", + "# Store all related text-normalized files\n", + "normdir = 'NeMoTTS_normalize_files'\n", + "# Store all supplementary files\n", + "suppdir = \"NeMoTTS_sup_data\"\n", + "# Store all config files\n", + "confdir = \"NeMoTTS_conf\"\n", + "# Store all training logs\n", + "logsdir = \"NeMoTTS_logs\"\n", + "# Store all mel-spectrograms for vocoder training\n", + "melsdir = \"NeMoTTS_mels\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a235bc58", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import shutil\n", + "import nemo\n", + "import torch\n", + "import numpy as np\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "7117eebe", + "metadata": {}, + "source": [ + "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fe5e9da", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.core import adapter_mixins\n", + "from omegaconf import DictConfig, OmegaConf, open_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17a3892c", + "metadata": {}, + "outputs": [], + "source": [ + "def update_model_config_to_support_adapter(config) -> DictConfig:\n", + " with open_dict(config):\n", + " enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_)\n", + " if enc_adapter_metadata is not None:\n", + " config.input_fft._target_ = enc_adapter_metadata.adapter_class_path\n", + "\n", + " dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_)\n", + " if dec_adapter_metadata is not None:\n", + " config.output_fft._target_ = dec_adapter_metadata.adapter_class_path\n", + "\n", + " pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_)\n", + " if pitch_predictor_adapter_metadata is not None:\n", + " config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_)\n", + " if duration_predictor_adapter_metadata is not None:\n", + " config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_)\n", + " if aligner_adapter_metadata is not None:\n", + " config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path\n", + "\n", + " return config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f5390d2", + "metadata": {}, + "outputs": [], + "source": [ + "state = torch.load(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", + "torch.save(state, YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e07ac1c0", + "metadata": {}, + "outputs": [], + "source": [ + "shutil.copyfile(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT, \"FastPitch.pt\")\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", + "shutil.copyfile(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT, \"HifiGan.pt\")\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"HifiGan.pt\"" + ] + }, + { + "cell_type": "markdown", + "id": "f32e7bb1", + "metadata": {}, + "source": [ + "# 2. Fine-tune FastPitch on adaptation data" + ] + }, + { + "cell_type": "markdown", + "id": "7d45a5d4", + "metadata": {}, + "source": [ + "## a. Data Preparation\n", + "For our tutorial, we use small part of VCTK dataset with a new target speaker (p267). Usually, the audios should have total duration more than 15 mintues." 
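+    ,
+    "\n",
+    "\n",
+    "If you substitute your own adaptation data for the VCTK subset downloaded below, it is worth checking the total duration first. The sketch below is illustrative only: it assumes the `soundfile` package is available and that the manifest follows the NeMo format used in this tutorial (one JSON record per line with an `audio_filepath` field).\n",
+    "\n",
+    "```python\n",
+    "# Illustrative sanity check: total audio duration (in minutes) referenced by a manifest.\n",
+    "import json\n",
+    "import soundfile as sf\n",
+    "\n",
+    "def total_duration_minutes(manifest_path):\n",
+    "    total_sec = 0.0\n",
+    "    with open(manifest_path) as f:\n",
+    "        for line in f:\n",
+    "            record = json.loads(line)\n",
+    "            info = sf.info(record['audio_filepath'])  # reads the header only, not the audio\n",
+    "            total_sec += info.frames / info.samplerate\n",
+    "    return total_sec / 60.0\n",
+    "\n",
+    "# e.g. print(total_duration_minutes(train_manifest))  # expect more than ~15 minutes\n",
+    "```"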
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9799debe", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aceea64f", + "metadata": {}, + "outputs": [], + "source": [ + "manidir = f\"{datadir}/vctk_subset\"\n", + "!ls {manidir}" + ] + }, + { + "cell_type": "markdown", + "id": "6d64cb74", + "metadata": {}, + "source": [ + "For simplicity, we use original dev set as training set and original test set as validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36ad65ae", + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest = os.path.abspath(os.path.join(manidir, 'train.json'))\n", + "valid_manifest = os.path.abspath(os.path.join(manidir, 'dev.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "2cdcd15d", + "metadata": {}, + "source": [ + "## b. Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f95711cc", + "metadata": {}, + "outputs": [], + "source": [ + "# additional files\n", + "!mkdir -p {normdir} && cd {normdir} \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" + ] + }, + { + "cell_type": "markdown", + "id": "34452374", + "metadata": {}, + "source": [ + "### Add absolute file path in manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89c1712", + "metadata": {}, + "outputs": [], + "source": [ + "def json_reader(filename):\n", + " lines = []\n", + " with open(filename) as f:\n", + " for line in f: lines.append(json.loads(line))\n", + " return lines\n", + "\n", + "def json_writer(manifest, filename):\n", + " with open(filename, 'w') as fout:\n", + " for m in manifest: fout.write(json.dumps(m) + '\\n') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a713d486", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(train_datas, train_manifest)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2021c507", + "metadata": {}, + "outputs": [], + "source": [ + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "3ae42b30", + "metadata": {}, + "source": [ + "### Calibrate speaker id to start from 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce86f9c5", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "560f0f94", + "metadata": {}, + "source": [ + "### Calculate Pitch Stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "108be984", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd69c623", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pitch(sample): \n", + " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", + " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", + " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", + " \n", + " if os.path.exists(pitch_filepath):\n", + " pitch = torch.load(pitch_filepath).numpy()\n", + "\n", + " else:\n", + " features = wave_model.process(\n", + " sample[\"audio_filepath\"]\n", + " )\n", + " voiced_tuple = librosa.pyin(\n", + " features.numpy(),\n", + " fmin=librosa.note_to_hz('C2'),\n", + " fmax=librosa.note_to_hz('C7'),\n", + " frame_length=2048,\n", + " sr=44100,\n", + " fill_na=0.0,\n", + " )\n", + " pitch = voiced_tuple[0]\n", + " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", + " \n", + " return pitch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc620c67", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "pitch_dir = os.path.join(suppdir, 'pitch')\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "os.makedirs(pitch_dir, exist_ok=True)\n", + "\n", + "train_pitchs = []\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", + " \n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "for m in tqdm(valid_datas): get_pitch(m)\n", + "\n", + "train_pitchs = np.concatenate(train_pitchs)\n", + "pitch_mean = float(np.mean(train_pitchs))\n", + "pitch_std = float(np.std(train_pitchs))\n", + "\n", + "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", + " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + ] + }, + { + "cell_type": "markdown", + "id": "c8eb5b1d", + "metadata": {}, + "source": [ + "## c. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3b0d80", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d26d2c2b", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch_finetune_adapters.py" + ] + }, + { + "cell_type": "markdown", + "id": "8d21f6b5", + "metadata": {}, + "source": [ + "### Important notes\n", + "* **+init_from_ptl_ckpt**: initialize with a multi-speaker FastPitch checkpoint\n", + "* **model.n_speakers=1**: the number of speakers in the data. 
There is only 1 for now, \n", + "* **~model.speaker_encoder.lookup_module**: remove the pre-trained looked-up speaker embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2845624e", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 100 epochs (15 mins)\n", + "!(python {codedir}/fastpitch_finetune_adapters.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=fastpitch_align_44100_adapter.yaml \\\n", + "+init_from_ptl_ckpt={YOUR_PRETRAINED_FASTPITCH_CHECKPOINT} \\\n", + "sample_rate=44100 \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", + "sup_data_path={suppdir} \\\n", + "pitch_mean={pitch_mean} \\\n", + "pitch_std={pitch_std} \\\n", + "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", + "heteronyms_path={normdir}/heteronyms-052722 \\\n", + "model.n_speakers=1 \\\n", + "~model.speaker_encoder.lookup_module \\\n", + "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", + "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.pitch_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.alignment_module.condition_types=\"['add']\" \\\n", + "model.train_ds.dataloader_params.batch_size=8 \\\n", + "model.validation_ds.dataloader_params.batch_size=8 \\\n", + "model.train_ds.dataloader_params.num_workers=8 \\\n", + "model.validation_ds.dataloader_params.num_workers=8 \\\n", + "+model.text_tokenizer.add_blank_at=True \\\n", + "model.optim.name=adam \\\n", + "model.optim.lr=2e-4 \\\n", + "model.optim.weight_decay=0.0 \\\n", + "~model.optim.sched \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "+exp_manager.create_wandb_logger=True \\\n", + "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-finetune-adaptation\" \\\n", + "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + "+exp_manager.checkpoint_callback_params.save_top_k=-1 \\\n", + "trainer.max_epochs=10 \\\n", + "trainer.check_val_every_n_epoch=10 \\\n", + "trainer.log_every_n_steps=1 \\\n", + "trainer.devices=1 \\\n", + "trainer.strategy=ddp \\\n", + "trainer.precision=32 \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44cbeee8", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_ADAPTER_CHECKPOINT = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", + "YOUR_FINETUNED_ADAPTER_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b477be0c", + "metadata": {}, + "source": [ + "# 4. Fine-tune HiFiGAN on adaptation data" + ] + }, + { + "cell_type": "markdown", + "id": "6f690dbc", + "metadata": {}, + "source": [ + "## a. 
Dataset Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc2415f7", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", + "from nemo.collections.tts.models import FastPitchModel\n", + "from collections import defaultdict\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd8fb4b7", + "metadata": {}, + "outputs": [], + "source": [ + "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", + " \n", + " record = manifest[index]\n", + " audio_file = record[\"audio_filepath\"]\n", + " \n", + " if '.wav' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", + " \n", + " if '.flac' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", + " \n", + " if os.path.exists(save_path):\n", + " return save_path\n", + " \n", + " if \"normalized_text\" in record:\n", + " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", + " else:\n", + " text = spec_model.parse(record['text'])\n", + " \n", + " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", + " \n", + " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", + " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", + " \n", + " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", + " \n", + " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", + " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", + " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", + " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", + " \n", + " \n", + " with torch.no_grad():\n", + " spectrogram = spec_model.forward(\n", + " text=text, \n", + " input_lens=text_len,\n", + " spec=spect, \n", + " mel_lens=spect_len, \n", + " attn_prior=attn_prior,\n", + " reference_spec=reference_spec,\n", + " reference_spec_lens=reference_spec_len,\n", + " )[0]\n", + " \n", + " spec = spectrogram[0].to('cpu').numpy()\n", + " np.save(save_path, spec)\n", + " return save_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e9a0d81", + "metadata": {}, + "outputs": [], + "source": [ + "# Pretrained FastPitch Weights\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "\n", + "# Load Adapter Weights\n", + "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model.freeze()\n", + "spec_model.unfreeze_enabled_adapters()\n", + "spec_model.eval().cuda()\n", + "\n", + "beta_binomial_interpolator = BetaBinomialInterpolator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd1b06e4", + "metadata": {}, + "outputs": [], + "source": [ + 
"os.makedirs(melsdir, exist_ok=True)\n", + "\n", + "# Train\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(train_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "\n", + "# Valid\n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(valid_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "2ecf4794", + "metadata": {}, + "source": [ + "## b. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f599ff2b", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", + "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", + "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", + "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19de8999", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 500 epochs (30 mins)\n", + "!(python {codedir}/hifigan_finetune.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=hifigan_44100.yaml \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "+init_from_ptl_ckpt={YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT} \\\n", + "model.train_ds.dataloader_params.batch_size=32 \\\n", + "model.optim.lr=0.0001 \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", + "model/train_ds=train_ds_finetune \\\n", + "model/validation_ds=val_ds_finetune \\\n", + "trainer.devices=1 \\\n", + "trainer.strategy='ddp' \\\n", + "trainer.precision=16 \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.create_wandb_logger=True \\\n", + "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-adaptation\" \\\n", + "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d959a60f", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. 
NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "e476784f", + "metadata": {}, + "source": [ + "# 3. Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6759111", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.models import HifiGanModel\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "97736efb", + "metadata": {}, + "source": [ + "## a. Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a228df69", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93ac4bdd", + "metadata": {}, + "outputs": [], + "source": [ + "# FastPitch\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model.freeze()\n", + "spec_model.unfreeze_enabled_adapters()\n", + "spec_model = spec_model.eval().cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "414c2710", + "metadata": {}, + "outputs": [], + "source": [ + "# HiFiGAN\n", + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "markdown", + "id": "5cf1c315", + "metadata": {}, + "source": [ + "## b. 
Output Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac94f171", + "metadata": {}, + "outputs": [], + "source": [ + "def gt_spectrogram(audio_path, wave_model, spec_gen_model):\n", + " features = wave_model.process(audio_path, trim=False)\n", + " audio, audio_length = features, torch.tensor(features.shape[0]).long()\n", + " audio = audio.unsqueeze(0).to(device=spec_gen_model.device)\n", + " audio_length = audio_length.unsqueeze(0).to(device=spec_gen_model.device)\n", + " with torch.no_grad():\n", + " spectrogram, spec_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_length)\n", + " return spectrogram, spec_len\n", + "\n", + "def gen_spectrogram(text, spec_gen_model, reference_spec, reference_spec_lens):\n", + " parsed = spec_gen_model.parse(text)\n", + " with torch.no_grad(): \n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " return spectrogram\n", + " \n", + "def synth_audio(vocoder_model, spectrogram): \n", + " with torch.no_grad(): \n", + " audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)\n", + " if isinstance(audio, torch.Tensor):\n", + " audio = audio.to('cpu').numpy()\n", + " return audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfe7c6b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Reference Audio\n", + "with open(train_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " reference_record = json.loads(line)\n", + " break\n", + " \n", + "# Validatation Audio\n", + "num_val = 3\n", + "val_records = []\n", + "with open(valid_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " val_records.append(json.loads(line))\n", + " if len(val_records) >= num_val:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "375f77c9", + "metadata": {}, + "outputs": [], + "source": [ + "for i, val_record in enumerate(val_records):\n", + " reference_spec, reference_spec_lens = gt_spectrogram(reference_record['audio_filepath'], wave_model, spec_model)\n", + " reference_spec = reference_spec.to(spec_model.device)\n", + " spec_pred = gen_spectrogram(val_record['text'], spec_model,\n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " audio_gen = synth_audio(vocoder_model, spec_pred)\n", + " \n", + " audio_ref = ipd.Audio(reference_record['audio_filepath'], rate=sample_rate)\n", + " audio_gt = ipd.Audio(val_record['audio_filepath'], rate=sample_rate)\n", + " audio_gen = ipd.Audio(audio_gen, rate=sample_rate)\n", + " \n", + " print(\"------\")\n", + " print(f\"Text: {val_record['text']}\")\n", + " print('Reference Audio')\n", + " ipd.display(audio_ref)\n", + " print('Ground Truth Audio')\n", + " ipd.display(audio_gt)\n", + " print('Synthesized Audio')\n", + " ipd.display(audio_gen)\n", + " plt.imshow(spec_pred[0].to('cpu').numpy(), origin=\"lower\", aspect=\"auto\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa738cb2", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c573b954", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_FINETUNED_ADAPTER_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "805137d7", + "metadata": {}, + "outputs": [], + "source": [ + 
"str(YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2038cf9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb new file mode 100644 index 000000000000..85df41e2979b --- /dev/null +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -0,0 +1,868 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "369c55f1", + "metadata": {}, + "source": [ + "# FastPitch MultiSpeaker Pretraining\n", + "\n", + "This notebook is designed to provide a guide on how to run FastPitch MultiSpeaker Pretraining Pipeline. It contains the following sections:\n", + "1. **Pre-train FastPitch on multi-speaker data**: pre-train a multi-speaker FastPitch\n", + "* Dataset Preparation: download dataset and extract manifest files.\n", + "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Training: pre-train multispeaker FastPitch\n", + " * Input: we introduce additional speaker id and reference audio.\n", + " * Speaker: we have looked-up speaker embedding and speaker encoder. \n", + " * Condition: we can condition pitch/duration predictors, mel-spectrogram decoder, aligner, and layernorm layers.\n", + "2. **Fine-tune HiFiGAN on multi-speaker data**: fine-tune a vocoder for the pre-trained multi-speaker FastPitch\n", + "* Dataset Preparation: extract mel-spectrograms from pre-trained FastPitch.\n", + "* Training: fine-tune HiFiGAN with pre-trained multi-speaker data.\n", + "3. **Inference**: generate speech from pre-trained multi-speaker FastPitch\n", + "* Load Model: load pre-trained multi-speaker FastPitch.\n", + "* Output Audio: generate audio files." + ] + }, + { + "cell_type": "markdown", + "id": "a5f7fa32", + "metadata": {}, + "source": [ + "# License\n", + "\n", + "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7055e07", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. 
Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71b9226", + "metadata": {}, + "outputs": [], + "source": [ + "!wandb login #PASTE_WANDB_APIKEY_HERE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad7daf8", + "metadata": {}, + "outputs": [], + "source": [ + "sample_rate = 44100\n", + "# Store all python script\n", + "codedir = 'NeMoTTS' \n", + "# Store all manifest and audios\n", + "datadir = 'NeMoTTS_dataset'\n", + "# Store all related text-normalized files\n", + "normdir = 'NeMoTTS_normalize_files'\n", + "# Store all supplementary files\n", + "suppdir = \"NeMoTTS_sup_data\"\n", + "# Store all config files\n", + "confdir = \"NeMoTTS_conf\"\n", + "# Store all training logs\n", + "logsdir = \"NeMoTTS_logs\"\n", + "# Store all mel-spectrograms for vocoder training\n", + "melsdir = \"NeMoTTS_mels\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32d8df7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import nemo\n", + "import torch\n", + "import numpy as np\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "74f6a27e", + "metadata": {}, + "source": [ + "# 1. Pre-train FastPitch on multi-speaker data" + ] + }, + { + "cell_type": "markdown", + "id": "92b28f3f", + "metadata": {}, + "source": [ + "## a. Dataset Preparation\n", + "For our tutorial, we use the subset of VCTK dataset with 5 speakers (p225-p229). The audios have 48 kHz sampling rate, we downsample to 44.1 kHz in this tutorial. \n", + "You can read more about dataset [here](https://datashare.ed.ac.uk/handle/10283/2950)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcdec070", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d313cda", + "metadata": {}, + "outputs": [], + "source": [ + "manidir = f\"{datadir}/vctk_subset_multispeaker\"\n", + "!ls {manidir}" + ] + }, + { + "cell_type": "markdown", + "id": "2466d86f", + "metadata": {}, + "source": [ + "For simplicity, we use original dev set as training set and original test set as validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccb4c7c5", + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest = os.path.abspath(os.path.join(manidir, 'train.json'))\n", + "valid_manifest = os.path.abspath(os.path.join(manidir, 'dev.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "262e7ef5", + "metadata": {}, + "source": [ + "## b. 
Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bf29c9c", + "metadata": {}, + "outputs": [], + "source": [ + "# additional files\n", + "!mkdir -p {normdir} && cd {normdir} \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" + ] + }, + { + "cell_type": "markdown", + "id": "fd260436", + "metadata": {}, + "source": [ + "### Add absoluate audio path in manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c0c87d", + "metadata": {}, + "outputs": [], + "source": [ + "def json_reader(filename):\n", + " lines = []\n", + " with open(filename) as f:\n", + " for line in f: lines.append(json.loads(line))\n", + " return lines\n", + "\n", + "def json_writer(manifest, filename):\n", + " with open(filename, 'w') as fout:\n", + " for m in manifest: fout.write(json.dumps(m) + '\\n') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b2a36e", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "2417ad44", + "metadata": {}, + "source": [ + "### Calibrate speaker id to start from 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad3d5037", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "speaker2id = {s: _id for _id, s in enumerate(set([m['speaker'] for m in train_datas]))}\n", + "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "7a2e96db", + "metadata": {}, + "source": [ + "### Calculate Pitch Stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eb311c5", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21777c97", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pitch(sample): \n", + " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", + " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", + " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", + " \n", + " if os.path.exists(pitch_filepath):\n", + " pitch = torch.load(pitch_filepath).numpy()\n", + "\n", + " else:\n", + " features = wave_model.process(\n", + " sample[\"audio_filepath\"]\n", + " )\n", + " voiced_tuple = librosa.pyin(\n", + " features.numpy(),\n", + " fmin=librosa.note_to_hz('C2'),\n", + " 
fmax=librosa.note_to_hz('C7'),\n", + " frame_length=2048,\n", + " sr=sample_rate,\n", + " fill_na=0.0,\n", + " )\n", + " pitch = voiced_tuple[0]\n", + " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", + " \n", + " return pitch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b2a6f19", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "pitch_dir = os.path.join(suppdir, 'pitch')\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "os.makedirs(pitch_dir, exist_ok=True)\n", + "\n", + "train_pitchs = []\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", + " \n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "for m in tqdm(valid_datas): get_pitch(m)\n", + "\n", + "train_pitchs = np.concatenate(train_pitchs)\n", + "pitch_mean = float(np.mean(train_pitchs))\n", + "pitch_std = float(np.std(train_pitchs))\n", + "\n", + "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", + " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + ] + }, + { + "cell_type": "markdown", + "id": "32425bca", + "metadata": {}, + "source": [ + "## c. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd3471cd", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c71051e", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {codedir} && cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch.py" + ] + }, + { + "cell_type": "markdown", + "id": "792c0ba8", + "metadata": {}, + "source": [ + "### Important notes\n", + "* [Input] **speaker_id** in **sub_data_types**: each data has an unique speaker index (start from 0) in the input.\n", + "* [Input] **reference_audio** in **sub_data_types**: each data has a reference audio (from the same speaker) in the input.\n", + "* [Speaker] **model.n_speakers**: model gets the speaker size. \n", + "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to extract speaker information from speaker id.\n", + "* [Speaker] **model.speaker_encoder.gst_module**: model creates global style token to extract speaker information from reference audio.\n", + "* [Condition] **condition_types=\"['add', 'layernorm']\"**: insert conditions with `add` operation to inputs and `layernorm` operation to layernorms." 
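+    ,
+    "\n",
+    "\n",
+    "As a rough mental model of what these condition types do (an illustrative PyTorch sketch only, not NeMo's actual module implementation): `add` projects the speaker representation and adds it to the layer input, while `layernorm` predicts a speaker-dependent scale and shift applied after normalization.\n",
+    "\n",
+    "```python\n",
+    "# Minimal sketch of 'add' and 'layernorm' conditioning on a speaker embedding (illustrative only).\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class ConditionedBlock(nn.Module):\n",
+    "    def __init__(self, hidden_dim, cond_dim):\n",
+    "        super().__init__()\n",
+    "        self.proj_add = nn.Linear(cond_dim, hidden_dim)                 # 'add' condition\n",
+    "        self.norm = nn.LayerNorm(hidden_dim, elementwise_affine=False)\n",
+    "        self.cond_scale = nn.Linear(cond_dim, hidden_dim)               # 'layernorm' condition\n",
+    "        self.cond_shift = nn.Linear(cond_dim, hidden_dim)\n",
+    "\n",
+    "    def forward(self, x, cond):\n",
+    "        # x: [batch, time, hidden_dim]; cond: [batch, cond_dim] speaker embedding\n",
+    "        x = x + self.proj_add(cond).unsqueeze(1)\n",
+    "        x = self.norm(x)\n",
+    "        return x * (1.0 + self.cond_scale(cond).unsqueeze(1)) + self.cond_shift(cond).unsqueeze(1)\n",
+    "```"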
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34f50e59", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 200 epochs\n", + "!(python {codedir}/fastpitch.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=fastpitch_align_44100_adapter.yaml \\\n", + "+init_from_pretrained_model=\"tts_en_fastpitch\" \\\n", + "sample_rate={sample_rate} \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id','reference_audio']\" \\\n", + "sup_data_path={suppdir} \\\n", + "pitch_mean={pitch_mean} \\\n", + "pitch_std={pitch_std} \\\n", + "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", + "heteronyms_path={normdir}/heteronyms-052722 \\\n", + "model.n_speakers=5 \\\n", + "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.LookupTable\" \\\n", + "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", + "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.pitch_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.alignment_module.condition_types=\"['add']\" \\\n", + "model.train_ds.dataloader_params.batch_size=8 \\\n", + "model.validation_ds.dataloader_params.batch_size=8 \\\n", + "model.train_ds.dataloader_params.num_workers=8 \\\n", + "model.validation_ds.dataloader_params.num_workers=8 \\\n", + "model.train_ds.dataset.max_duration=20 \\\n", + "model.validation_ds.dataset.max_duration=20 \\\n", + "model.validation_ds.dataset.min_duration=0.1 \\\n", + "+model.text_tokenizer.add_blank_at=True \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "+exp_manager.create_wandb_logger=True \\\n", + "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", + "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + "trainer.max_epochs=20 \\\n", + "trainer.check_val_every_n_epoch=20 \\\n", + "trainer.log_every_n_steps=1 \\\n", + "trainer.devices=-1 \\\n", + "trainer.strategy=ddp \\\n", + "trainer.precision=32 \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0a115ba", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b4c430d1", + "metadata": {}, + "source": [ + "# 2. Fine-tune HiFiGAN on multi-speaker data" + ] + }, + { + "cell_type": "markdown", + "id": "a12d558c", + "metadata": {}, + "source": [ + "## a. 
Dataset Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b49320ea", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", + "from nemo.collections.tts.models import FastPitchModel\n", + "from collections import defaultdict\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a503861d", + "metadata": {}, + "outputs": [], + "source": [ + "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", + " \n", + " record = manifest[index]\n", + " audio_file = record[\"audio_filepath\"]\n", + " \n", + " if '.wav' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", + " \n", + " if '.flac' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", + " \n", + " if os.path.exists(save_path):\n", + " return save_path\n", + " \n", + " if \"normalized_text\" in record:\n", + " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", + " else:\n", + " text = spec_model.parse(record['text'])\n", + " \n", + " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", + " \n", + " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", + " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", + " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", + " \n", + " speaker = torch.tensor([record['speaker']]).to(spec_model.device)\n", + " \n", + " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", + " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", + " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", + " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", + " \n", + " with torch.no_grad():\n", + " spectrogram = spec_model.forward(\n", + " text=text, \n", + " input_lens=text_len,\n", + " spec=spect, \n", + " mel_lens=spect_len, \n", + " attn_prior=attn_prior,\n", + " speaker=speaker,\n", + " reference_spec=reference_spec,\n", + " reference_spec_lens=reference_spec_len\n", + " )[0]\n", + " \n", + " spec = spectrogram[0].to('cpu').numpy()\n", + " np.save(save_path, spec)\n", + " return save_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f947676c", + "metadata": {}, + "outputs": [], + "source": [ + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()\n", + "beta_binomial_interpolator = BetaBinomialInterpolator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "146a5027", + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(melsdir, exist_ok=True)\n", + "\n", + "# Train\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] 
for item in train_datas])\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(train_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "\n", + "# Valid\n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(valid_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "59830d4b", + "metadata": {}, + "source": [ + "## b. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3586b593", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", + "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", + "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", + "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83ff072a", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 100 epochs\n", + "!(python {codedir}/hifigan_finetune.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=hifigan_44100.yaml \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "+init_from_pretrained_model=\"tts_en_hifitts_hifigan_ft_fastpitch\" \\\n", + "model.train_ds.dataloader_params.batch_size=32 \\\n", + "model.optim.lr=0.0001 \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", + "model/train_ds=train_ds_finetune \\\n", + "model/validation_ds=val_ds_finetune \\\n", + "trainer.devices=-1 \\\n", + "trainer.strategy='ddp' \\\n", + "trainer.precision=16 \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.create_wandb_logger=True \\\n", + "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", + "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f999e8c", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. 
NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX-last.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b8da2e2b", + "metadata": {}, + "source": [ + "# 3. Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495e095f", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.models import HifiGanModel\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "9254244e", + "metadata": {}, + "source": [ + "## a. Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52ff6e88", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a09cefcd", + "metadata": {}, + "outputs": [], + "source": [ + "# FastPitch\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59277b2b", + "metadata": {}, + "outputs": [], + "source": [ + "# HiFiGAN\n", + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "markdown", + "id": "df5eae8c", + "metadata": {}, + "source": [ + "## b. Output Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "234aea6d", + "metadata": {}, + "outputs": [], + "source": [ + "def gt_spectrogram(audio_path, wave_model, spec_gen_model):\n", + " features = wave_model.process(audio_path, trim=False)\n", + " audio, audio_length = features, torch.tensor(features.shape[0]).long()\n", + " audio = audio.unsqueeze(0).to(device=spec_gen_model.device)\n", + " audio_length = audio_length.unsqueeze(0).to(device=spec_gen_model.device)\n", + " with torch.no_grad():\n", + " spectrogram, spec_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_length)\n", + " return spectrogram, spec_len\n", + "\n", + "def gen_spectrogram(text, spec_gen_model, speaker, reference_spec, reference_spec_lens):\n", + " parsed = spec_gen_model.parse(text)\n", + " speaker = torch.tensor([speaker]).long().to(device=spec_gen_model.device)\n", + " with torch.no_grad(): \n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", + " speaker=speaker, \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " return spectrogram\n", + " \n", + "def synth_audio(vocoder_model, spectrogram): \n", + " with torch.no_grad(): \n", + " audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)\n", + " if isinstance(audio, torch.Tensor):\n", + " audio = audio.to('cpu').numpy()\n", + " return audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf0be104", + "metadata": {}, + "outputs": [], + "source": [ + "# Reference Audio\n", + "reference_records = []\n", + "with open(train_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " reference_records.append(json.loads(line))\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(reference_records): 
speaker_to_index[d.get('speaker', None)].append(i)\n", + " \n", + "# Validatation Audio\n", + "num_val = 3\n", + "val_records = []\n", + "with open(valid_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " val_records.append(json.loads(line))\n", + " if len(val_records) >= num_val:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3328e3", + "metadata": {}, + "outputs": [], + "source": [ + "for i, val_record in enumerate(val_records):\n", + " reference_record = reference_records[speaker_to_index[val_record['speaker']][0]]\n", + " reference_spec, reference_spec_lens = gt_spectrogram(reference_record['audio_filepath'], wave_model, spec_model)\n", + " reference_spec = reference_spec.to(spec_model.device)\n", + " spec_pred = gen_spectrogram(val_record['text'], \n", + " spec_model,\n", + " speaker=val_record['speaker'], \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " audio_gen = synth_audio(vocoder_model, spec_pred)\n", + " \n", + " audio_ref = ipd.Audio(reference_record['audio_filepath'], rate=sample_rate)\n", + " audio_gt = ipd.Audio(val_record['audio_filepath'], rate=sample_rate)\n", + " audio_gen = ipd.Audio(audio_gen, rate=sample_rate)\n", + " \n", + " print(\"------\")\n", + " print(f\"Text: {val_record['text']}\")\n", + " print('Reference Audio')\n", + " ipd.display(audio_ref)\n", + " print('Ground Truth Audio')\n", + " ipd.display(audio_gt)\n", + " print('Synthesized Audio')\n", + " ipd.display(audio_gen)\n", + " plt.imshow(spec_pred[0].to('cpu').numpy(), origin=\"lower\", aspect=\"auto\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c7ca983", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c158077", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "685905cd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9692b7dd57c43848b4f0885a0d7ae0251b7bc4b5 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 17 Apr 2023 19:36:22 -0700 Subject: [PATCH 02/25] Update main tutorial Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 132 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 124 ++++++++-------- 2 files changed, 129 insertions(+), 127 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 1c9a377911af..26ef487830c4 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bceec759", + "id": "d7754dcf", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "9363d17a", + "id": "91190b36", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "f952558a", + "id": "71341824", "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5c4698e", + "id": "587e2ccd", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65b37d86", + "id": "7877db29", "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d5ec43c", + "id": "deb47145", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a235bc58", + "id": "c7560cad", "metadata": {}, "outputs": [], "source": [ @@ -130,7 +130,7 @@ }, { "cell_type": "markdown", - "id": "7117eebe", + "id": "2c40e507", "metadata": {}, "source": [ "# 1. Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fe5e9da", + "id": "21f55cf8", "metadata": {}, "outputs": [], "source": [ @@ -150,7 +150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17a3892c", + "id": "f88696b9", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +182,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f5390d2", + "id": "33bfaac9", "metadata": {}, "outputs": [], "source": [ @@ -194,19 +194,19 @@ { "cell_type": "code", "execution_count": null, - "id": "e07ac1c0", + "id": "12b65006", "metadata": {}, "outputs": [], "source": [ "shutil.copyfile(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT, \"FastPitch.pt\")\n", - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", "shutil.copyfile(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT, \"HifiGan.pt\")\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"HifiGan.pt\"" ] }, { "cell_type": "markdown", - "id": "f32e7bb1", + "id": "4eb540c9", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "7d45a5d4", + "id": "126e8a04", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -224,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9799debe", + "id": "f1442ea2", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +234,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aceea64f", + "id": "a14ef048", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "6d64cb74", + "id": "951b5f1a", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." @@ -253,7 +253,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36ad65ae", + "id": "4d9fd8f2", "metadata": {}, "outputs": [], "source": [ @@ -263,7 +263,7 @@ }, { "cell_type": "markdown", - "id": "2cdcd15d", + "id": "c51e0e87", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -272,7 +272,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f95711cc", + "id": "d02029ed", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "34452374", + "id": "cfcc7db6", "metadata": {}, "source": [ "### Add absolute file path in manifest" @@ -293,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d89c1712", + "id": "c0700d93", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +311,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a713d486", + "id": "643cf612", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2021c507", + "id": "9faa1fdd", "metadata": {}, "outputs": [], "source": [ @@ -334,7 +334,7 @@ }, { "cell_type": "markdown", - "id": "3ae42b30", + "id": "087c29e7", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -343,7 +343,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce86f9c5", + "id": "6f2dad27", "metadata": {}, "outputs": [], "source": [ @@ -358,7 +358,7 @@ }, { "cell_type": "markdown", - "id": "560f0f94", + "id": "7f9a7a6e", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "108be984", + "id": "8b7179f5", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd69c623", + "id": "fd41d74a", "metadata": {}, "outputs": [], "source": [ @@ -412,7 +412,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc620c67", + "id": "016b039e", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ }, { "cell_type": "markdown", - "id": "c8eb5b1d", + "id": "cd87cceb", "metadata": {}, "source": [ "## c. Training" @@ -449,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4e3b0d80", + "id": "c7724add", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d26d2c2b", + "id": "1a969914", "metadata": {}, "outputs": [], "source": [ @@ -468,19 +468,18 @@ }, { "cell_type": "markdown", - "id": "8d21f6b5", + "id": "96071e54", "metadata": {}, "source": [ "### Important notes\n", "* **+init_from_ptl_ckpt**: initialize with a multi-speaker FastPitch checkpoint\n", - "* **model.n_speakers=1**: the number of speakers in the data. There is only 1 for now, \n", "* **~model.speaker_encoder.lookup_module**: remove the pre-trained looked-up speaker embedding" ] }, { "cell_type": "code", "execution_count": null, - "id": "2845624e", + "id": "a6f1cc94", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +497,6 @@ "pitch_std={pitch_std} \\\n", "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", "heteronyms_path={normdir}/heteronyms-052722 \\\n", - "model.n_speakers=1 \\\n", "~model.speaker_encoder.lookup_module \\\n", "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", @@ -532,7 +530,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44cbeee8", + "id": "31286dca", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +542,7 @@ }, { "cell_type": "markdown", - "id": "b477be0c", + "id": "06e8f037", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -552,7 +550,7 @@ }, { "cell_type": "markdown", - "id": "6f690dbc", + "id": "40026019", "metadata": {}, "source": [ "## a. 
Dataset Preparation" @@ -561,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc2415f7", + "id": "3630a563", "metadata": {}, "outputs": [], "source": [ @@ -574,7 +572,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd8fb4b7", + "id": "ef6edacb", "metadata": {}, "outputs": [], "source": [ @@ -631,7 +629,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e9a0d81", + "id": "a62bc5ff", "metadata": {}, "outputs": [], "source": [ @@ -640,8 +638,6 @@ "\n", "# Load Adapter Weights\n", "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", - "spec_model.freeze()\n", - "spec_model.unfreeze_enabled_adapters()\n", "spec_model.eval().cuda()\n", "\n", "beta_binomial_interpolator = BetaBinomialInterpolator()" @@ -650,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd1b06e4", + "id": "c2a6e50d", "metadata": {}, "outputs": [], "source": [ @@ -684,7 +680,7 @@ }, { "cell_type": "markdown", - "id": "2ecf4794", + "id": "a68926d1", "metadata": {}, "source": [ "## b. Training" @@ -693,7 +689,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f599ff2b", + "id": "90ef58be", "metadata": {}, "outputs": [], "source": [ @@ -707,7 +703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19de8999", + "id": "b7f8d543", "metadata": {}, "outputs": [], "source": [ @@ -737,7 +733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d959a60f", + "id": "919a519d", "metadata": {}, "outputs": [], "source": [ @@ -749,7 +745,7 @@ }, { "cell_type": "markdown", - "id": "e476784f", + "id": "378fe2e3", "metadata": {}, "source": [ "# 3. Inference" @@ -758,7 +754,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6759111", + "id": "aabb1a66", "metadata": {}, "outputs": [], "source": [ @@ -769,7 +765,7 @@ }, { "cell_type": "markdown", - "id": "97736efb", + "id": "7274970d", "metadata": {}, "source": [ "## a. Load Model" @@ -778,7 +774,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a228df69", + "id": "4c32b949", "metadata": {}, "outputs": [], "source": [ @@ -788,22 +784,22 @@ { "cell_type": "code", "execution_count": null, - "id": "93ac4bdd", + "id": "75982012", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", - "spec_model.freeze()\n", - "spec_model.unfreeze_enabled_adapters()\n", + "# spec_model.freeze()\n", + "# spec_model.unfreeze_enabled_adapters()\n", "spec_model = spec_model.eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "414c2710", + "id": "580fb6c6", "metadata": {}, "outputs": [], "source": [ @@ -813,7 +809,7 @@ }, { "cell_type": "markdown", - "id": "5cf1c315", + "id": "45533e1a", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -822,7 +818,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac94f171", + "id": "9e85db0f", "metadata": {}, "outputs": [], "source": [ @@ -855,7 +851,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cfe7c6b8", + "id": "78d23c8f", "metadata": {}, "outputs": [], "source": [ @@ -878,7 +874,7 @@ { "cell_type": "code", "execution_count": null, - "id": "375f77c9", + "id": "a8485d48", "metadata": {}, "outputs": [], "source": [ @@ -910,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa738cb2", + "id": "7c137592", "metadata": {}, "outputs": [], "source": [ @@ -920,7 +916,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c573b954", + "id": "081bee07", "metadata": {}, "outputs": [], "source": [ @@ -930,7 +926,7 @@ { "cell_type": "code", "execution_count": null, - "id": "805137d7", + "id": "997d2f41", "metadata": {}, "outputs": [], "source": [ @@ -940,7 +936,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2038cf9", + "id": "0ac4b8d4", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 85df41e2979b..8bb4e27e5fcd 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "369c55f1", + "id": "13c04c94", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "a5f7fa32", + "id": "f0717c51", "metadata": {}, "source": [ "# License\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e7055e07", + "id": "136dd989", "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", @@ -70,7 +70,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a71b9226", + "id": "dd1f0fe2", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ad7daf8", + "id": "6364adf3", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e32d8df7", + "id": "633b93a7", "metadata": {}, "outputs": [], "source": [ @@ -120,7 +120,7 @@ }, { "cell_type": "markdown", - "id": "74f6a27e", + "id": "75c95297", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -128,7 +128,7 @@ }, { "cell_type": "markdown", - "id": "92b28f3f", + "id": "312d1b49", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fcdec070", + "id": "394a25ed", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d313cda", + "id": "17cf9dae", "metadata": {}, "outputs": [], "source": [ @@ -159,7 +159,7 @@ }, { "cell_type": "markdown", - "id": "2466d86f", + "id": "f49d301f", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." 
@@ -168,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ccb4c7c5", + "id": "f4711864", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "262e7ef5", + "id": "342b056f", "metadata": {}, "source": [ "## b. Preprocessing" @@ -187,7 +187,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7bf29c9c", + "id": "bd08c5d8", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +199,7 @@ }, { "cell_type": "markdown", - "id": "fd260436", + "id": "be281bd0", "metadata": {}, "source": [ "### Add absoluate audio path in manifest" @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08c0c87d", + "id": "f7f4b8a8", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32b2a36e", + "id": "eaa83d5a", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "2417ad44", + "id": "954dc35b", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -250,7 +250,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad3d5037", + "id": "12b0d1ef", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "7a2e96db", + "id": "1bc147e9", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -275,7 +275,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6eb311c5", + "id": "002bb013", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21777c97", + "id": "3aa5d471", "metadata": {}, "outputs": [], "source": [ @@ -320,7 +320,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b2a6f19", + "id": "d1b12283", "metadata": {}, "outputs": [], "source": [ @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "32425bca", + "id": "a33692c5", "metadata": {}, "source": [ "## c. Training" @@ -357,7 +357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd3471cd", + "id": "022540fb", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c71051e", + "id": "5b05c1ce", "metadata": {}, "outputs": [], "source": [ @@ -376,14 +376,14 @@ }, { "cell_type": "markdown", - "id": "792c0ba8", + "id": "33ce089f", "metadata": {}, "source": [ "### Important notes\n", - "* [Input] **speaker_id** in **sub_data_types**: each data has an unique speaker index (start from 0) in the input.\n", - "* [Input] **reference_audio** in **sub_data_types**: each data has a reference audio (from the same speaker) in the input.\n", - "* [Speaker] **model.n_speakers**: model gets the speaker size. \n", - "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to extract speaker information from speaker id.\n", + "* [Data] **speaker_id** in **sup_data_types**: each data has an unique speaker index (start from 0) in the input.\n", + "* [Data] **reference_audio** in **sup_data_types**: each data has a reference audio (from the same speaker) in the input.\n", + "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to get speaker embedding from speaker id.\n", + "* [Speaker] **model.speaker_encoder.lookup_module.n_speakers**: model gets the speaker size. 
\n", "* [Speaker] **model.speaker_encoder.gst_module**: model creates global style token to extract speaker information from reference audio.\n", "* [Condition] **condition_types=\"['add', 'layernorm']\"**: insert conditions with `add` operation to inputs and `layernorm` operation to layernorms." ] @@ -391,11 +391,13 @@ { "cell_type": "code", "execution_count": null, - "id": "34f50e59", + "id": "40e890fd", "metadata": {}, "outputs": [], "source": [ "# Normally 200 epochs\n", + "# `dataset.trim=True` only used for VCTK\n", + "# `dataset.trim_top_db=20` only used for VCTK\n", "!(python {codedir}/fastpitch.py \\\n", "--config-path={os.path.abspath(confdir)} \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", @@ -409,8 +411,8 @@ "pitch_std={pitch_std} \\\n", "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", "heteronyms_path={normdir}/heteronyms-052722 \\\n", - "model.n_speakers=5 \\\n", - "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.LookupTable\" \\\n", + "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.SpeakerLookupTable\" \\\n", + "model.speaker_encoder.lookup_module.n_speakers=5 \\\n", "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", @@ -425,6 +427,10 @@ "model.validation_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.min_duration=0.1 \\\n", "+model.text_tokenizer.add_blank_at=True \\\n", + "model.train_ds.dataset.trim=True \\\n", + "model.validation_ds.dataset.trim=True \\\n", + "+model.train_ds.dataset.trim_top_db=20 \\\n", + "+model.validation_ds.dataset.trim_top_db=20 \\\n", "exp_manager.exp_dir={logsdir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", @@ -441,7 +447,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c0a115ba", + "id": "594a12c7", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +459,7 @@ }, { "cell_type": "markdown", - "id": "b4c430d1", + "id": "21760d8c", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -461,7 +467,7 @@ }, { "cell_type": "markdown", - "id": "a12d558c", + "id": "059a5c57", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -470,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b49320ea", + "id": "f21ae142", "metadata": {}, "outputs": [], "source": [ @@ -483,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a503861d", + "id": "a0c916de", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +547,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f947676c", + "id": "141303a7", "metadata": {}, "outputs": [], "source": [ @@ -552,7 +558,7 @@ { "cell_type": "code", "execution_count": null, - "id": "146a5027", + "id": "4ea69569", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +594,7 @@ }, { "cell_type": "markdown", - "id": "59830d4b", + "id": "fceb5623", "metadata": {}, "source": [ "## b. 
Training" @@ -597,7 +603,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3586b593", + "id": "d0ca65ec", "metadata": {}, "outputs": [], "source": [ @@ -611,7 +617,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83ff072a", + "id": "593513f2", "metadata": {}, "outputs": [], "source": [ @@ -641,7 +647,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f999e8c", + "id": "84fed050", "metadata": {}, "outputs": [], "source": [ @@ -653,7 +659,7 @@ }, { "cell_type": "markdown", - "id": "b8da2e2b", + "id": "f94cd841", "metadata": {}, "source": [ "# 3. Inference" @@ -662,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "495e095f", + "id": "2807c212", "metadata": {}, "outputs": [], "source": [ @@ -673,7 +679,7 @@ }, { "cell_type": "markdown", - "id": "9254244e", + "id": "8dbac3ae", "metadata": {}, "source": [ "## a. Load Model" @@ -682,7 +688,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52ff6e88", + "id": "3e42dfe9", "metadata": {}, "outputs": [], "source": [ @@ -692,7 +698,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a09cefcd", + "id": "418c198e", "metadata": {}, "outputs": [], "source": [ @@ -703,7 +709,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59277b2b", + "id": "fca84cda", "metadata": {}, "outputs": [], "source": [ @@ -713,7 +719,7 @@ }, { "cell_type": "markdown", - "id": "df5eae8c", + "id": "1ef482fe", "metadata": {}, "source": [ "## b. Output Audio" @@ -722,7 +728,7 @@ { "cell_type": "code", "execution_count": null, - "id": "234aea6d", + "id": "b646d311", "metadata": {}, "outputs": [], "source": [ @@ -757,7 +763,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf0be104", + "id": "e5541924", "metadata": {}, "outputs": [], "source": [ @@ -783,7 +789,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e3328e3", + "id": "d76870d7", "metadata": {}, "outputs": [], "source": [ @@ -818,7 +824,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c7ca983", + "id": "735bbbf9", "metadata": {}, "outputs": [], "source": [ @@ -828,7 +834,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c158077", + "id": "c7111d2a", "metadata": {}, "outputs": [], "source": [ @@ -838,7 +844,7 @@ { "cell_type": "code", "execution_count": null, - "id": "685905cd", + "id": "1be132a3", "metadata": {}, "outputs": [], "source": [] From d7567f0cdd2e361d811f8712874221463e4989ab Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Fri, 14 Apr 2023 16:29:11 -0700 Subject: [PATCH 03/25] Add tts adapter tutorial Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 970 ++++++++++++++++++ .../FastPitch_MultiSpeaker_Pretraining.ipynb | 868 ++++++++++++++++ 2 files changed, 1838 insertions(+) create mode 100644 tutorials/tts/FastPitch_Adapter_Finetuning.ipynb create mode 100644 tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb new file mode 100644 index 000000000000..1c9a377911af --- /dev/null +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -0,0 +1,970 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bceec759", + "metadata": {}, + "source": [ + "# FastPitch Adapter Finetuning\n", + "\n", + "This notebook is designed to provide a guide on how to run FastPitch Adapter Finetuning Pipeline. It contains the following sections:\n", + "1. **Transform pre-trained FastPitch checkpoint to adapter-compatible checkpoint**\n", + "2. 
**Fine-tune FastPitch on adaptation data**: fine-tune pre-trained multi-speaker FastPitch for a new speaker\n", + "* Dataset Preparation: download dataset and extract manifest files. (duration more than 15 mins)\n", + "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Training: fine-tune frozen multispeaker FastPitch with trainable adapters.\n", + "3. **Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", + "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", + "* Training: fine-tune HiFiGAN with fine-tuned adaptation data.\n", + "4. **Inference**: generate speech from adpated FastPitch\n", + "* Load Model: load pre-trained multi-speaker FastPitch with fine-tuned adapters.\n", + "* Output Audio: generate audio files." + ] + }, + { + "cell_type": "markdown", + "id": "9363d17a", + "metadata": {}, + "source": [ + "# License\n", + "\n", + "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f952558a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. 
Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5c4698e", + "metadata": {}, + "outputs": [], + "source": [ + "!wandb login #PASTE_WANDB_APIKEY_HERE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b37d86", + "metadata": {}, + "outputs": [], + "source": [ + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5ec43c", + "metadata": {}, + "outputs": [], + "source": [ + "sample_rate = 44100\n", + "# Store all python script\n", + "codedir = 'NeMoTTS' \n", + "# Store all manifest and audios\n", + "datadir = 'NeMoTTS_dataset'\n", + "# Store all related text-normalized files\n", + "normdir = 'NeMoTTS_normalize_files'\n", + "# Store all supplementary files\n", + "suppdir = \"NeMoTTS_sup_data\"\n", + "# Store all config files\n", + "confdir = \"NeMoTTS_conf\"\n", + "# Store all training logs\n", + "logsdir = \"NeMoTTS_logs\"\n", + "# Store all mel-spectrograms for vocoder training\n", + "melsdir = \"NeMoTTS_mels\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a235bc58", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import shutil\n", + "import nemo\n", + "import torch\n", + "import numpy as np\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "7117eebe", + "metadata": {}, + "source": [ + "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fe5e9da", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.core import adapter_mixins\n", + "from omegaconf import DictConfig, OmegaConf, open_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17a3892c", + "metadata": {}, + "outputs": [], + "source": [ + "def update_model_config_to_support_adapter(config) -> DictConfig:\n", + " with open_dict(config):\n", + " enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_)\n", + " if enc_adapter_metadata is not None:\n", + " config.input_fft._target_ = enc_adapter_metadata.adapter_class_path\n", + "\n", + " dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_)\n", + " if dec_adapter_metadata is not None:\n", + " config.output_fft._target_ = dec_adapter_metadata.adapter_class_path\n", + "\n", + " pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_)\n", + " if pitch_predictor_adapter_metadata is not None:\n", + " config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_)\n", + " if duration_predictor_adapter_metadata is not None:\n", + " config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_)\n", + " if aligner_adapter_metadata is not None:\n", + " config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path\n", + "\n", + " return config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f5390d2", + "metadata": {}, + "outputs": [], + "source": [ + "state = torch.load(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", + "torch.save(state, YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e07ac1c0", + "metadata": {}, + "outputs": [], + "source": [ + "shutil.copyfile(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT, \"FastPitch.pt\")\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", + "shutil.copyfile(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT, \"HifiGan.pt\")\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"HifiGan.pt\"" + ] + }, + { + "cell_type": "markdown", + "id": "f32e7bb1", + "metadata": {}, + "source": [ + "# 2. Fine-tune FastPitch on adaptation data" + ] + }, + { + "cell_type": "markdown", + "id": "7d45a5d4", + "metadata": {}, + "source": [ + "## a. Data Preparation\n", + "For our tutorial, we use small part of VCTK dataset with a new target speaker (p267). Usually, the audios should have total duration more than 15 mintues." 
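The 15-minute guideline above is easy to verify once the manifest is downloaded. The snippet below is a minimal sketch, not one of the notebook cells; it assumes the tutorial's `NeMoTTS_dataset/vctk_subset` layout (relative `audio_filepath`s at this stage) and that `soundfile` is available in the environment.

```python
import json
import os
import soundfile as sf

def total_minutes(manifest_path, audio_root=""):
    """Sum the duration (in minutes) of all audio referenced by a JSON-lines manifest."""
    seconds = 0.0
    with open(manifest_path) as f:
        for line in f:
            record = json.loads(line)
            # Use a precomputed 'duration' field when present, otherwise read the file header.
            duration = record.get("duration")
            if duration is None:
                audio_path = os.path.join(audio_root, record["audio_filepath"])
                duration = sf.info(audio_path).duration
            seconds += duration
    return seconds / 60.0

minutes = total_minutes("NeMoTTS_dataset/vctk_subset/train.json", "NeMoTTS_dataset/vctk_subset")
print(f"Adaptation data: {minutes:.1f} min")
```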
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9799debe", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aceea64f", + "metadata": {}, + "outputs": [], + "source": [ + "manidir = f\"{datadir}/vctk_subset\"\n", + "!ls {manidir}" + ] + }, + { + "cell_type": "markdown", + "id": "6d64cb74", + "metadata": {}, + "source": [ + "For simplicity, we use original dev set as training set and original test set as validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36ad65ae", + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest = os.path.abspath(os.path.join(manidir, 'train.json'))\n", + "valid_manifest = os.path.abspath(os.path.join(manidir, 'dev.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "2cdcd15d", + "metadata": {}, + "source": [ + "## b. Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f95711cc", + "metadata": {}, + "outputs": [], + "source": [ + "# additional files\n", + "!mkdir -p {normdir} && cd {normdir} \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" + ] + }, + { + "cell_type": "markdown", + "id": "34452374", + "metadata": {}, + "source": [ + "### Add absolute file path in manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89c1712", + "metadata": {}, + "outputs": [], + "source": [ + "def json_reader(filename):\n", + " lines = []\n", + " with open(filename) as f:\n", + " for line in f: lines.append(json.loads(line))\n", + " return lines\n", + "\n", + "def json_writer(manifest, filename):\n", + " with open(filename, 'w') as fout:\n", + " for m in manifest: fout.write(json.dumps(m) + '\\n') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a713d486", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(train_datas, train_manifest)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2021c507", + "metadata": {}, + "outputs": [], + "source": [ + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "3ae42b30", + "metadata": {}, + "source": [ + "### Calibrate speaker id to start from 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce86f9c5", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "560f0f94", + "metadata": {}, + "source": [ + "### Calculate Pitch Stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "108be984", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd69c623", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pitch(sample): \n", + " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", + " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", + " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", + " \n", + " if os.path.exists(pitch_filepath):\n", + " pitch = torch.load(pitch_filepath).numpy()\n", + "\n", + " else:\n", + " features = wave_model.process(\n", + " sample[\"audio_filepath\"]\n", + " )\n", + " voiced_tuple = librosa.pyin(\n", + " features.numpy(),\n", + " fmin=librosa.note_to_hz('C2'),\n", + " fmax=librosa.note_to_hz('C7'),\n", + " frame_length=2048,\n", + " sr=44100,\n", + " fill_na=0.0,\n", + " )\n", + " pitch = voiced_tuple[0]\n", + " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", + " \n", + " return pitch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc620c67", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "pitch_dir = os.path.join(suppdir, 'pitch')\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "os.makedirs(pitch_dir, exist_ok=True)\n", + "\n", + "train_pitchs = []\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", + " \n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "for m in tqdm(valid_datas): get_pitch(m)\n", + "\n", + "train_pitchs = np.concatenate(train_pitchs)\n", + "pitch_mean = float(np.mean(train_pitchs))\n", + "pitch_std = float(np.std(train_pitchs))\n", + "\n", + "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", + " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + ] + }, + { + "cell_type": "markdown", + "id": "c8eb5b1d", + "metadata": {}, + "source": [ + "## c. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3b0d80", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d26d2c2b", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch_finetune_adapters.py" + ] + }, + { + "cell_type": "markdown", + "id": "8d21f6b5", + "metadata": {}, + "source": [ + "### Important notes\n", + "* **+init_from_ptl_ckpt**: initialize with a multi-speaker FastPitch checkpoint\n", + "* **model.n_speakers=1**: the number of speakers in the data. 
There is only 1 for now, \n", + "* **~model.speaker_encoder.lookup_module**: remove the pre-trained looked-up speaker embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2845624e", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 100 epochs (15 mins)\n", + "!(python {codedir}/fastpitch_finetune_adapters.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=fastpitch_align_44100_adapter.yaml \\\n", + "+init_from_ptl_ckpt={YOUR_PRETRAINED_FASTPITCH_CHECKPOINT} \\\n", + "sample_rate=44100 \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", + "sup_data_path={suppdir} \\\n", + "pitch_mean={pitch_mean} \\\n", + "pitch_std={pitch_std} \\\n", + "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", + "heteronyms_path={normdir}/heteronyms-052722 \\\n", + "model.n_speakers=1 \\\n", + "~model.speaker_encoder.lookup_module \\\n", + "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", + "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.pitch_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.alignment_module.condition_types=\"['add']\" \\\n", + "model.train_ds.dataloader_params.batch_size=8 \\\n", + "model.validation_ds.dataloader_params.batch_size=8 \\\n", + "model.train_ds.dataloader_params.num_workers=8 \\\n", + "model.validation_ds.dataloader_params.num_workers=8 \\\n", + "+model.text_tokenizer.add_blank_at=True \\\n", + "model.optim.name=adam \\\n", + "model.optim.lr=2e-4 \\\n", + "model.optim.weight_decay=0.0 \\\n", + "~model.optim.sched \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "+exp_manager.create_wandb_logger=True \\\n", + "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-finetune-adaptation\" \\\n", + "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + "+exp_manager.checkpoint_callback_params.save_top_k=-1 \\\n", + "trainer.max_epochs=10 \\\n", + "trainer.check_val_every_n_epoch=10 \\\n", + "trainer.log_every_n_steps=1 \\\n", + "trainer.devices=1 \\\n", + "trainer.strategy=ddp \\\n", + "trainer.precision=32 \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44cbeee8", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_ADAPTER_CHECKPOINT = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", + "YOUR_FINETUNED_ADAPTER_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b477be0c", + "metadata": {}, + "source": [ + "# 4. Fine-tune HiFiGAN on adaptation data" + ] + }, + { + "cell_type": "markdown", + "id": "6f690dbc", + "metadata": {}, + "source": [ + "## a. 
Dataset Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc2415f7", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", + "from nemo.collections.tts.models import FastPitchModel\n", + "from collections import defaultdict\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd8fb4b7", + "metadata": {}, + "outputs": [], + "source": [ + "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", + " \n", + " record = manifest[index]\n", + " audio_file = record[\"audio_filepath\"]\n", + " \n", + " if '.wav' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", + " \n", + " if '.flac' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", + " \n", + " if os.path.exists(save_path):\n", + " return save_path\n", + " \n", + " if \"normalized_text\" in record:\n", + " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", + " else:\n", + " text = spec_model.parse(record['text'])\n", + " \n", + " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", + " \n", + " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", + " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", + " \n", + " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", + " \n", + " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", + " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", + " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", + " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", + " \n", + " \n", + " with torch.no_grad():\n", + " spectrogram = spec_model.forward(\n", + " text=text, \n", + " input_lens=text_len,\n", + " spec=spect, \n", + " mel_lens=spect_len, \n", + " attn_prior=attn_prior,\n", + " reference_spec=reference_spec,\n", + " reference_spec_lens=reference_spec_len,\n", + " )[0]\n", + " \n", + " spec = spectrogram[0].to('cpu').numpy()\n", + " np.save(save_path, spec)\n", + " return save_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e9a0d81", + "metadata": {}, + "outputs": [], + "source": [ + "# Pretrained FastPitch Weights\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "\n", + "# Load Adapter Weights\n", + "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model.freeze()\n", + "spec_model.unfreeze_enabled_adapters()\n", + "spec_model.eval().cuda()\n", + "\n", + "beta_binomial_interpolator = BetaBinomialInterpolator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd1b06e4", + "metadata": {}, + "outputs": [], + "source": [ + 
"os.makedirs(melsdir, exist_ok=True)\n", + "\n", + "# Train\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(train_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "\n", + "# Valid\n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(valid_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "2ecf4794", + "metadata": {}, + "source": [ + "## b. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f599ff2b", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", + "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", + "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", + "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19de8999", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 500 epochs (30 mins)\n", + "!(python {codedir}/hifigan_finetune.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=hifigan_44100.yaml \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "+init_from_ptl_ckpt={YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT} \\\n", + "model.train_ds.dataloader_params.batch_size=32 \\\n", + "model.optim.lr=0.0001 \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", + "model/train_ds=train_ds_finetune \\\n", + "model/validation_ds=val_ds_finetune \\\n", + "trainer.devices=1 \\\n", + "trainer.strategy='ddp' \\\n", + "trainer.precision=16 \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.create_wandb_logger=True \\\n", + "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-adaptation\" \\\n", + "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d959a60f", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. 
NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "e476784f", + "metadata": {}, + "source": [ + "# 3. Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6759111", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.models import HifiGanModel\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "97736efb", + "metadata": {}, + "source": [ + "## a. Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a228df69", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93ac4bdd", + "metadata": {}, + "outputs": [], + "source": [ + "# FastPitch\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model.freeze()\n", + "spec_model.unfreeze_enabled_adapters()\n", + "spec_model = spec_model.eval().cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "414c2710", + "metadata": {}, + "outputs": [], + "source": [ + "# HiFiGAN\n", + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "markdown", + "id": "5cf1c315", + "metadata": {}, + "source": [ + "## b. 
Output Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac94f171", + "metadata": {}, + "outputs": [], + "source": [ + "def gt_spectrogram(audio_path, wave_model, spec_gen_model):\n", + " features = wave_model.process(audio_path, trim=False)\n", + " audio, audio_length = features, torch.tensor(features.shape[0]).long()\n", + " audio = audio.unsqueeze(0).to(device=spec_gen_model.device)\n", + " audio_length = audio_length.unsqueeze(0).to(device=spec_gen_model.device)\n", + " with torch.no_grad():\n", + " spectrogram, spec_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_length)\n", + " return spectrogram, spec_len\n", + "\n", + "def gen_spectrogram(text, spec_gen_model, reference_spec, reference_spec_lens):\n", + " parsed = spec_gen_model.parse(text)\n", + " with torch.no_grad(): \n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " return spectrogram\n", + " \n", + "def synth_audio(vocoder_model, spectrogram): \n", + " with torch.no_grad(): \n", + " audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)\n", + " if isinstance(audio, torch.Tensor):\n", + " audio = audio.to('cpu').numpy()\n", + " return audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfe7c6b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Reference Audio\n", + "with open(train_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " reference_record = json.loads(line)\n", + " break\n", + " \n", + "# Validatation Audio\n", + "num_val = 3\n", + "val_records = []\n", + "with open(valid_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " val_records.append(json.loads(line))\n", + " if len(val_records) >= num_val:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "375f77c9", + "metadata": {}, + "outputs": [], + "source": [ + "for i, val_record in enumerate(val_records):\n", + " reference_spec, reference_spec_lens = gt_spectrogram(reference_record['audio_filepath'], wave_model, spec_model)\n", + " reference_spec = reference_spec.to(spec_model.device)\n", + " spec_pred = gen_spectrogram(val_record['text'], spec_model,\n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " audio_gen = synth_audio(vocoder_model, spec_pred)\n", + " \n", + " audio_ref = ipd.Audio(reference_record['audio_filepath'], rate=sample_rate)\n", + " audio_gt = ipd.Audio(val_record['audio_filepath'], rate=sample_rate)\n", + " audio_gen = ipd.Audio(audio_gen, rate=sample_rate)\n", + " \n", + " print(\"------\")\n", + " print(f\"Text: {val_record['text']}\")\n", + " print('Reference Audio')\n", + " ipd.display(audio_ref)\n", + " print('Ground Truth Audio')\n", + " ipd.display(audio_gt)\n", + " print('Synthesized Audio')\n", + " ipd.display(audio_gen)\n", + " plt.imshow(spec_pred[0].to('cpu').numpy(), origin=\"lower\", aspect=\"auto\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa738cb2", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c573b954", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_FINETUNED_ADAPTER_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "805137d7", + "metadata": {}, + "outputs": [], + "source": [ + 
"str(YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2038cf9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb new file mode 100644 index 000000000000..85df41e2979b --- /dev/null +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -0,0 +1,868 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "369c55f1", + "metadata": {}, + "source": [ + "# FastPitch MultiSpeaker Pretraining\n", + "\n", + "This notebook is designed to provide a guide on how to run FastPitch MultiSpeaker Pretraining Pipeline. It contains the following sections:\n", + "1. **Pre-train FastPitch on multi-speaker data**: pre-train a multi-speaker FastPitch\n", + "* Dataset Preparation: download dataset and extract manifest files.\n", + "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Training: pre-train multispeaker FastPitch\n", + " * Input: we introduce additional speaker id and reference audio.\n", + " * Speaker: we have looked-up speaker embedding and speaker encoder. \n", + " * Condition: we can condition pitch/duration predictors, mel-spectrogram decoder, aligner, and layernorm layers.\n", + "2. **Fine-tune HiFiGAN on multi-speaker data**: fine-tune a vocoder for the pre-trained multi-speaker FastPitch\n", + "* Dataset Preparation: extract mel-spectrograms from pre-trained FastPitch.\n", + "* Training: fine-tune HiFiGAN with pre-trained multi-speaker data.\n", + "3. **Inference**: generate speech from pre-trained multi-speaker FastPitch\n", + "* Load Model: load pre-trained multi-speaker FastPitch.\n", + "* Output Audio: generate audio files." + ] + }, + { + "cell_type": "markdown", + "id": "a5f7fa32", + "metadata": {}, + "source": [ + "# License\n", + "\n", + "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + "> \n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7055e07", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. 
Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies# .\n", + "\"\"\"\n", + "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71b9226", + "metadata": {}, + "outputs": [], + "source": [ + "!wandb login #PASTE_WANDB_APIKEY_HERE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad7daf8", + "metadata": {}, + "outputs": [], + "source": [ + "sample_rate = 44100\n", + "# Store all python script\n", + "codedir = 'NeMoTTS' \n", + "# Store all manifest and audios\n", + "datadir = 'NeMoTTS_dataset'\n", + "# Store all related text-normalized files\n", + "normdir = 'NeMoTTS_normalize_files'\n", + "# Store all supplementary files\n", + "suppdir = \"NeMoTTS_sup_data\"\n", + "# Store all config files\n", + "confdir = \"NeMoTTS_conf\"\n", + "# Store all training logs\n", + "logsdir = \"NeMoTTS_logs\"\n", + "# Store all mel-spectrograms for vocoder training\n", + "melsdir = \"NeMoTTS_mels\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32d8df7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import nemo\n", + "import torch\n", + "import numpy as np\n", + "\n", + "from pathlib import Path\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "74f6a27e", + "metadata": {}, + "source": [ + "# 1. Pre-train FastPitch on multi-speaker data" + ] + }, + { + "cell_type": "markdown", + "id": "92b28f3f", + "metadata": {}, + "source": [ + "## a. Dataset Preparation\n", + "For our tutorial, we use the subset of VCTK dataset with 5 speakers (p225-p229). The audios have 48 kHz sampling rate, we downsample to 44.1 kHz in this tutorial. \n", + "You can read more about dataset [here](https://datashare.ed.ac.uk/handle/10283/2950)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcdec070", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d313cda", + "metadata": {}, + "outputs": [], + "source": [ + "manidir = f\"{datadir}/vctk_subset_multispeaker\"\n", + "!ls {manidir}" + ] + }, + { + "cell_type": "markdown", + "id": "2466d86f", + "metadata": {}, + "source": [ + "For simplicity, we use original dev set as training set and original test set as validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccb4c7c5", + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest = os.path.abspath(os.path.join(manidir, 'train.json'))\n", + "valid_manifest = os.path.abspath(os.path.join(manidir, 'dev.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "262e7ef5", + "metadata": {}, + "source": [ + "## b. 
Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bf29c9c", + "metadata": {}, + "outputs": [], + "source": [ + "# additional files\n", + "!mkdir -p {normdir} && cd {normdir} \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", + "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" + ] + }, + { + "cell_type": "markdown", + "id": "fd260436", + "metadata": {}, + "source": [ + "### Add absoluate audio path in manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c0c87d", + "metadata": {}, + "outputs": [], + "source": [ + "def json_reader(filename):\n", + " lines = []\n", + " with open(filename) as f:\n", + " for line in f: lines.append(json.loads(line))\n", + " return lines\n", + "\n", + "def json_writer(manifest, filename):\n", + " with open(filename, 'w') as fout:\n", + " for m in manifest: fout.write(json.dumps(m) + '\\n') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b2a36e", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "2417ad44", + "metadata": {}, + "source": [ + "### Calibrate speaker id to start from 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad3d5037", + "metadata": {}, + "outputs": [], + "source": [ + "train_datas = json_reader(train_manifest)\n", + "speaker2id = {s: _id for _id, s in enumerate(set([m['speaker'] for m in train_datas]))}\n", + "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "valid_datas = json_reader(valid_manifest)\n", + "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "7a2e96db", + "metadata": {}, + "source": [ + "### Calculate Pitch Stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eb311c5", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21777c97", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pitch(sample): \n", + " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", + " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", + " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", + " \n", + " if os.path.exists(pitch_filepath):\n", + " pitch = torch.load(pitch_filepath).numpy()\n", + "\n", + " else:\n", + " features = wave_model.process(\n", + " sample[\"audio_filepath\"]\n", + " )\n", + " voiced_tuple = librosa.pyin(\n", + " features.numpy(),\n", + " fmin=librosa.note_to_hz('C2'),\n", + " 
fmax=librosa.note_to_hz('C7'),\n", + " frame_length=2048,\n", + " sr=sample_rate,\n", + " fill_na=0.0,\n", + " )\n", + " pitch = voiced_tuple[0]\n", + " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", + " \n", + " return pitch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b2a6f19", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "pitch_dir = os.path.join(suppdir, 'pitch')\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "os.makedirs(pitch_dir, exist_ok=True)\n", + "\n", + "train_pitchs = []\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", + "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", + " \n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "for m in tqdm(valid_datas): get_pitch(m)\n", + "\n", + "train_pitchs = np.concatenate(train_pitchs)\n", + "pitch_mean = float(np.mean(train_pitchs))\n", + "pitch_std = float(np.std(train_pitchs))\n", + "\n", + "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", + " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + ] + }, + { + "cell_type": "markdown", + "id": "32425bca", + "metadata": {}, + "source": [ + "## c. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd3471cd", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c71051e", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p {codedir} && cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch.py" + ] + }, + { + "cell_type": "markdown", + "id": "792c0ba8", + "metadata": {}, + "source": [ + "### Important notes\n", + "* [Input] **speaker_id** in **sub_data_types**: each data has an unique speaker index (start from 0) in the input.\n", + "* [Input] **reference_audio** in **sub_data_types**: each data has a reference audio (from the same speaker) in the input.\n", + "* [Speaker] **model.n_speakers**: model gets the speaker size. \n", + "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to extract speaker information from speaker id.\n", + "* [Speaker] **model.speaker_encoder.gst_module**: model creates global style token to extract speaker information from reference audio.\n", + "* [Condition] **condition_types=\"['add', 'layernorm']\"**: insert conditions with `add` operation to inputs and `layernorm` operation to layernorms." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34f50e59", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 200 epochs\n", + "!(python {codedir}/fastpitch.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=fastpitch_align_44100_adapter.yaml \\\n", + "+init_from_pretrained_model=\"tts_en_fastpitch\" \\\n", + "sample_rate={sample_rate} \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id','reference_audio']\" \\\n", + "sup_data_path={suppdir} \\\n", + "pitch_mean={pitch_mean} \\\n", + "pitch_std={pitch_std} \\\n", + "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", + "heteronyms_path={normdir}/heteronyms-052722 \\\n", + "model.n_speakers=5 \\\n", + "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.LookupTable\" \\\n", + "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", + "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", + "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.pitch_predictor.condition_types=\"['add', 'layernorm']\" \\\n", + "model.alignment_module.condition_types=\"['add']\" \\\n", + "model.train_ds.dataloader_params.batch_size=8 \\\n", + "model.validation_ds.dataloader_params.batch_size=8 \\\n", + "model.train_ds.dataloader_params.num_workers=8 \\\n", + "model.validation_ds.dataloader_params.num_workers=8 \\\n", + "model.train_ds.dataset.max_duration=20 \\\n", + "model.validation_ds.dataset.max_duration=20 \\\n", + "model.validation_ds.dataset.min_duration=0.1 \\\n", + "+model.text_tokenizer.add_blank_at=True \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "+exp_manager.create_wandb_logger=True \\\n", + "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", + "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + "trainer.max_epochs=20 \\\n", + "trainer.check_val_every_n_epoch=20 \\\n", + "trainer.log_every_n_steps=1 \\\n", + "trainer.devices=-1 \\\n", + "trainer.strategy=ddp \\\n", + "trainer.precision=32 \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0a115ba", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b4c430d1", + "metadata": {}, + "source": [ + "# 2. Fine-tune HiFiGAN on multi-speaker data" + ] + }, + { + "cell_type": "markdown", + "id": "a12d558c", + "metadata": {}, + "source": [ + "## a. 
Dataset Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b49320ea", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", + "from nemo.collections.tts.models import FastPitchModel\n", + "from collections import defaultdict\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a503861d", + "metadata": {}, + "outputs": [], + "source": [ + "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", + " \n", + " record = manifest[index]\n", + " audio_file = record[\"audio_filepath\"]\n", + " \n", + " if '.wav' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", + " \n", + " if '.flac' in audio_file:\n", + " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", + " \n", + " if os.path.exists(save_path):\n", + " return save_path\n", + " \n", + " if \"normalized_text\" in record:\n", + " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", + " else:\n", + " text = spec_model.parse(record['text'])\n", + " \n", + " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", + " \n", + " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", + " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", + " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", + " \n", + " speaker = torch.tensor([record['speaker']]).to(spec_model.device)\n", + " \n", + " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", + " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", + " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", + " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", + " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", + " \n", + " with torch.no_grad():\n", + " spectrogram = spec_model.forward(\n", + " text=text, \n", + " input_lens=text_len,\n", + " spec=spect, \n", + " mel_lens=spect_len, \n", + " attn_prior=attn_prior,\n", + " speaker=speaker,\n", + " reference_spec=reference_spec,\n", + " reference_spec_lens=reference_spec_len\n", + " )[0]\n", + " \n", + " spec = spectrogram[0].to('cpu').numpy()\n", + " np.save(save_path, spec)\n", + " return save_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f947676c", + "metadata": {}, + "outputs": [], + "source": [ + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()\n", + "beta_binomial_interpolator = BetaBinomialInterpolator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "146a5027", + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(melsdir, exist_ok=True)\n", + "\n", + "# Train\n", + "train_datas = json_reader(train_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] 
for item in train_datas])\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(train_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(train_datas, train_manifest)\n", + "\n", + "\n", + "# Valid\n", + "valid_datas = json_reader(valid_manifest)\n", + "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", + "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", + "\n", + "for i, record in enumerate(tqdm(valid_datas)):\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", + "\n", + "json_writer(valid_datas, valid_manifest)" + ] + }, + { + "cell_type": "markdown", + "id": "59830d4b", + "metadata": {}, + "source": [ + "## b. Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3586b593", + "metadata": {}, + "outputs": [], + "source": [ + "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", + "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", + "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", + "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", + "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83ff072a", + "metadata": {}, + "outputs": [], + "source": [ + "# Normally 100 epochs\n", + "!(python {codedir}/hifigan_finetune.py \\\n", + "--config-path={os.path.abspath(confdir)} \\\n", + "--config-name=hifigan_44100.yaml \\\n", + "train_dataset={train_manifest} \\\n", + "validation_datasets={valid_manifest} \\\n", + "+init_from_pretrained_model=\"tts_en_hifitts_hifigan_ft_fastpitch\" \\\n", + "model.train_ds.dataloader_params.batch_size=32 \\\n", + "model.optim.lr=0.0001 \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", + "model/train_ds=train_ds_finetune \\\n", + "model/validation_ds=val_ds_finetune \\\n", + "trainer.devices=-1 \\\n", + "trainer.strategy='ddp' \\\n", + "trainer.precision=16 \\\n", + "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.create_wandb_logger=True \\\n", + "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", + "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f999e8c", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. 
NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX-last.ckpt\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT" + ] + }, + { + "cell_type": "markdown", + "id": "b8da2e2b", + "metadata": {}, + "source": [ + "# 3. Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495e095f", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.models import HifiGanModel\n", + "import IPython.display as ipd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "9254244e", + "metadata": {}, + "source": [ + "## a. Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52ff6e88", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a09cefcd", + "metadata": {}, + "outputs": [], + "source": [ + "# FastPitch\n", + "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59277b2b", + "metadata": {}, + "outputs": [], + "source": [ + "# HiFiGAN\n", + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT).eval().cuda()" + ] + }, + { + "cell_type": "markdown", + "id": "df5eae8c", + "metadata": {}, + "source": [ + "## b. Output Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "234aea6d", + "metadata": {}, + "outputs": [], + "source": [ + "def gt_spectrogram(audio_path, wave_model, spec_gen_model):\n", + " features = wave_model.process(audio_path, trim=False)\n", + " audio, audio_length = features, torch.tensor(features.shape[0]).long()\n", + " audio = audio.unsqueeze(0).to(device=spec_gen_model.device)\n", + " audio_length = audio_length.unsqueeze(0).to(device=spec_gen_model.device)\n", + " with torch.no_grad():\n", + " spectrogram, spec_len = spec_gen_model.preprocessor(input_signal=audio, length=audio_length)\n", + " return spectrogram, spec_len\n", + "\n", + "def gen_spectrogram(text, spec_gen_model, speaker, reference_spec, reference_spec_lens):\n", + " parsed = spec_gen_model.parse(text)\n", + " speaker = torch.tensor([speaker]).long().to(device=spec_gen_model.device)\n", + " with torch.no_grad(): \n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", + " speaker=speaker, \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " return spectrogram\n", + " \n", + "def synth_audio(vocoder_model, spectrogram): \n", + " with torch.no_grad(): \n", + " audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)\n", + " if isinstance(audio, torch.Tensor):\n", + " audio = audio.to('cpu').numpy()\n", + " return audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf0be104", + "metadata": {}, + "outputs": [], + "source": [ + "# Reference Audio\n", + "reference_records = []\n", + "with open(train_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " reference_records.append(json.loads(line))\n", + "\n", + "speaker_to_index = defaultdict(list)\n", + "for i, d in enumerate(reference_records): 
speaker_to_index[d.get('speaker', None)].append(i)\n", + " \n", + "# Validatation Audio\n", + "num_val = 3\n", + "val_records = []\n", + "with open(valid_manifest, \"r\") as f:\n", + " for i, line in enumerate(f):\n", + " val_records.append(json.loads(line))\n", + " if len(val_records) >= num_val:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3328e3", + "metadata": {}, + "outputs": [], + "source": [ + "for i, val_record in enumerate(val_records):\n", + " reference_record = reference_records[speaker_to_index[val_record['speaker']][0]]\n", + " reference_spec, reference_spec_lens = gt_spectrogram(reference_record['audio_filepath'], wave_model, spec_model)\n", + " reference_spec = reference_spec.to(spec_model.device)\n", + " spec_pred = gen_spectrogram(val_record['text'], \n", + " spec_model,\n", + " speaker=val_record['speaker'], \n", + " reference_spec=reference_spec, \n", + " reference_spec_lens=reference_spec_lens)\n", + "\n", + " audio_gen = synth_audio(vocoder_model, spec_pred)\n", + " \n", + " audio_ref = ipd.Audio(reference_record['audio_filepath'], rate=sample_rate)\n", + " audio_gt = ipd.Audio(val_record['audio_filepath'], rate=sample_rate)\n", + " audio_gen = ipd.Audio(audio_gen, rate=sample_rate)\n", + " \n", + " print(\"------\")\n", + " print(f\"Text: {val_record['text']}\")\n", + " print('Reference Audio')\n", + " ipd.display(audio_ref)\n", + " print('Ground Truth Audio')\n", + " ipd.display(audio_gt)\n", + " print('Synthesized Audio')\n", + " ipd.display(audio_gen)\n", + " plt.imshow(spec_pred[0].to('cpu').numpy(), origin=\"lower\", aspect=\"auto\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c7ca983", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c158077", + "metadata": {}, + "outputs": [], + "source": [ + "str(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "685905cd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 76571f753296fb57ac43604f1ee46bbf7a1b79c1 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 17 Apr 2023 19:36:22 -0700 Subject: [PATCH 04/25] Update main tutorial Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 132 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 124 ++++++++-------- 2 files changed, 129 insertions(+), 127 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 1c9a377911af..26ef487830c4 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bceec759", + "id": "d7754dcf", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "9363d17a", + "id": "91190b36", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "f952558a", + "id": "71341824", "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5c4698e", + "id": "587e2ccd", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65b37d86", + "id": "7877db29", "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d5ec43c", + "id": "deb47145", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a235bc58", + "id": "c7560cad", "metadata": {}, "outputs": [], "source": [ @@ -130,7 +130,7 @@ }, { "cell_type": "markdown", - "id": "7117eebe", + "id": "2c40e507", "metadata": {}, "source": [ "# 1. Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fe5e9da", + "id": "21f55cf8", "metadata": {}, "outputs": [], "source": [ @@ -150,7 +150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17a3892c", + "id": "f88696b9", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +182,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f5390d2", + "id": "33bfaac9", "metadata": {}, "outputs": [], "source": [ @@ -194,19 +194,19 @@ { "cell_type": "code", "execution_count": null, - "id": "e07ac1c0", + "id": "12b65006", "metadata": {}, "outputs": [], "source": [ "shutil.copyfile(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT, \"FastPitch.pt\")\n", - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", "shutil.copyfile(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT, \"HifiGan.pt\")\n", + "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"HifiGan.pt\"" ] }, { "cell_type": "markdown", - "id": "f32e7bb1", + "id": "4eb540c9", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "7d45a5d4", + "id": "126e8a04", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -224,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9799debe", + "id": "f1442ea2", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +234,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aceea64f", + "id": "a14ef048", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "6d64cb74", + "id": "951b5f1a", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." @@ -253,7 +253,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36ad65ae", + "id": "4d9fd8f2", "metadata": {}, "outputs": [], "source": [ @@ -263,7 +263,7 @@ }, { "cell_type": "markdown", - "id": "2cdcd15d", + "id": "c51e0e87", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -272,7 +272,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f95711cc", + "id": "d02029ed", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "34452374", + "id": "cfcc7db6", "metadata": {}, "source": [ "### Add absolute file path in manifest" @@ -293,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d89c1712", + "id": "c0700d93", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +311,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a713d486", + "id": "643cf612", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2021c507", + "id": "9faa1fdd", "metadata": {}, "outputs": [], "source": [ @@ -334,7 +334,7 @@ }, { "cell_type": "markdown", - "id": "3ae42b30", + "id": "087c29e7", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -343,7 +343,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce86f9c5", + "id": "6f2dad27", "metadata": {}, "outputs": [], "source": [ @@ -358,7 +358,7 @@ }, { "cell_type": "markdown", - "id": "560f0f94", + "id": "7f9a7a6e", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "108be984", + "id": "8b7179f5", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd69c623", + "id": "fd41d74a", "metadata": {}, "outputs": [], "source": [ @@ -412,7 +412,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc620c67", + "id": "016b039e", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ }, { "cell_type": "markdown", - "id": "c8eb5b1d", + "id": "cd87cceb", "metadata": {}, "source": [ "## c. Training" @@ -449,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4e3b0d80", + "id": "c7724add", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d26d2c2b", + "id": "1a969914", "metadata": {}, "outputs": [], "source": [ @@ -468,19 +468,18 @@ }, { "cell_type": "markdown", - "id": "8d21f6b5", + "id": "96071e54", "metadata": {}, "source": [ "### Important notes\n", "* **+init_from_ptl_ckpt**: initialize with a multi-speaker FastPitch checkpoint\n", - "* **model.n_speakers=1**: the number of speakers in the data. There is only 1 for now, \n", "* **~model.speaker_encoder.lookup_module**: remove the pre-trained looked-up speaker embedding" ] }, { "cell_type": "code", "execution_count": null, - "id": "2845624e", + "id": "a6f1cc94", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +497,6 @@ "pitch_std={pitch_std} \\\n", "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", "heteronyms_path={normdir}/heteronyms-052722 \\\n", - "model.n_speakers=1 \\\n", "~model.speaker_encoder.lookup_module \\\n", "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", @@ -532,7 +530,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44cbeee8", + "id": "31286dca", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +542,7 @@ }, { "cell_type": "markdown", - "id": "b477be0c", + "id": "06e8f037", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -552,7 +550,7 @@ }, { "cell_type": "markdown", - "id": "6f690dbc", + "id": "40026019", "metadata": {}, "source": [ "## a. 
Dataset Preparation" @@ -561,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc2415f7", + "id": "3630a563", "metadata": {}, "outputs": [], "source": [ @@ -574,7 +572,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd8fb4b7", + "id": "ef6edacb", "metadata": {}, "outputs": [], "source": [ @@ -631,7 +629,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e9a0d81", + "id": "a62bc5ff", "metadata": {}, "outputs": [], "source": [ @@ -640,8 +638,6 @@ "\n", "# Load Adapter Weights\n", "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", - "spec_model.freeze()\n", - "spec_model.unfreeze_enabled_adapters()\n", "spec_model.eval().cuda()\n", "\n", "beta_binomial_interpolator = BetaBinomialInterpolator()" @@ -650,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd1b06e4", + "id": "c2a6e50d", "metadata": {}, "outputs": [], "source": [ @@ -684,7 +680,7 @@ }, { "cell_type": "markdown", - "id": "2ecf4794", + "id": "a68926d1", "metadata": {}, "source": [ "## b. Training" @@ -693,7 +689,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f599ff2b", + "id": "90ef58be", "metadata": {}, "outputs": [], "source": [ @@ -707,7 +703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19de8999", + "id": "b7f8d543", "metadata": {}, "outputs": [], "source": [ @@ -737,7 +733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d959a60f", + "id": "919a519d", "metadata": {}, "outputs": [], "source": [ @@ -749,7 +745,7 @@ }, { "cell_type": "markdown", - "id": "e476784f", + "id": "378fe2e3", "metadata": {}, "source": [ "# 3. Inference" @@ -758,7 +754,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6759111", + "id": "aabb1a66", "metadata": {}, "outputs": [], "source": [ @@ -769,7 +765,7 @@ }, { "cell_type": "markdown", - "id": "97736efb", + "id": "7274970d", "metadata": {}, "source": [ "## a. Load Model" @@ -778,7 +774,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a228df69", + "id": "4c32b949", "metadata": {}, "outputs": [], "source": [ @@ -788,22 +784,22 @@ { "cell_type": "code", "execution_count": null, - "id": "93ac4bdd", + "id": "75982012", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", - "spec_model.freeze()\n", - "spec_model.unfreeze_enabled_adapters()\n", + "# spec_model.freeze()\n", + "# spec_model.unfreeze_enabled_adapters()\n", "spec_model = spec_model.eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "414c2710", + "id": "580fb6c6", "metadata": {}, "outputs": [], "source": [ @@ -813,7 +809,7 @@ }, { "cell_type": "markdown", - "id": "5cf1c315", + "id": "45533e1a", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -822,7 +818,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac94f171", + "id": "9e85db0f", "metadata": {}, "outputs": [], "source": [ @@ -855,7 +851,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cfe7c6b8", + "id": "78d23c8f", "metadata": {}, "outputs": [], "source": [ @@ -878,7 +874,7 @@ { "cell_type": "code", "execution_count": null, - "id": "375f77c9", + "id": "a8485d48", "metadata": {}, "outputs": [], "source": [ @@ -910,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa738cb2", + "id": "7c137592", "metadata": {}, "outputs": [], "source": [ @@ -920,7 +916,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c573b954", + "id": "081bee07", "metadata": {}, "outputs": [], "source": [ @@ -930,7 +926,7 @@ { "cell_type": "code", "execution_count": null, - "id": "805137d7", + "id": "997d2f41", "metadata": {}, "outputs": [], "source": [ @@ -940,7 +936,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2038cf9", + "id": "0ac4b8d4", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 85df41e2979b..8bb4e27e5fcd 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "369c55f1", + "id": "13c04c94", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "a5f7fa32", + "id": "f0717c51", "metadata": {}, "source": [ "# License\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e7055e07", + "id": "136dd989", "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'tts_fastpitch_speaker_encoder'\n", + "BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", @@ -70,7 +70,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a71b9226", + "id": "dd1f0fe2", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ad7daf8", + "id": "6364adf3", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e32d8df7", + "id": "633b93a7", "metadata": {}, "outputs": [], "source": [ @@ -120,7 +120,7 @@ }, { "cell_type": "markdown", - "id": "74f6a27e", + "id": "75c95297", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -128,7 +128,7 @@ }, { "cell_type": "markdown", - "id": "92b28f3f", + "id": "312d1b49", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fcdec070", + "id": "394a25ed", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d313cda", + "id": "17cf9dae", "metadata": {}, "outputs": [], "source": [ @@ -159,7 +159,7 @@ }, { "cell_type": "markdown", - "id": "2466d86f", + "id": "f49d301f", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." 
@@ -168,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ccb4c7c5", + "id": "f4711864", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "262e7ef5", + "id": "342b056f", "metadata": {}, "source": [ "## b. Preprocessing" @@ -187,7 +187,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7bf29c9c", + "id": "bd08c5d8", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +199,7 @@ }, { "cell_type": "markdown", - "id": "fd260436", + "id": "be281bd0", "metadata": {}, "source": [ "### Add absoluate audio path in manifest" @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08c0c87d", + "id": "f7f4b8a8", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32b2a36e", + "id": "eaa83d5a", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "2417ad44", + "id": "954dc35b", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -250,7 +250,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad3d5037", + "id": "12b0d1ef", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "7a2e96db", + "id": "1bc147e9", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -275,7 +275,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6eb311c5", + "id": "002bb013", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21777c97", + "id": "3aa5d471", "metadata": {}, "outputs": [], "source": [ @@ -320,7 +320,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b2a6f19", + "id": "d1b12283", "metadata": {}, "outputs": [], "source": [ @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "32425bca", + "id": "a33692c5", "metadata": {}, "source": [ "## c. Training" @@ -357,7 +357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd3471cd", + "id": "022540fb", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c71051e", + "id": "5b05c1ce", "metadata": {}, "outputs": [], "source": [ @@ -376,14 +376,14 @@ }, { "cell_type": "markdown", - "id": "792c0ba8", + "id": "33ce089f", "metadata": {}, "source": [ "### Important notes\n", - "* [Input] **speaker_id** in **sub_data_types**: each data has an unique speaker index (start from 0) in the input.\n", - "* [Input] **reference_audio** in **sub_data_types**: each data has a reference audio (from the same speaker) in the input.\n", - "* [Speaker] **model.n_speakers**: model gets the speaker size. \n", - "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to extract speaker information from speaker id.\n", + "* [Data] **speaker_id** in **sup_data_types**: each data has an unique speaker index (start from 0) in the input.\n", + "* [Data] **reference_audio** in **sup_data_types**: each data has a reference audio (from the same speaker) in the input.\n", + "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to get speaker embedding from speaker id.\n", + "* [Speaker] **model.speaker_encoder.lookup_module.n_speakers**: model gets the speaker size. 
\n", "* [Speaker] **model.speaker_encoder.gst_module**: model creates global style token to extract speaker information from reference audio.\n", "* [Condition] **condition_types=\"['add', 'layernorm']\"**: insert conditions with `add` operation to inputs and `layernorm` operation to layernorms." ] @@ -391,11 +391,13 @@ { "cell_type": "code", "execution_count": null, - "id": "34f50e59", + "id": "40e890fd", "metadata": {}, "outputs": [], "source": [ "# Normally 200 epochs\n", + "# `dataset.trim=True` only used for VCTK\n", + "# `dataset.trim_top_db=20` only used for VCTK\n", "!(python {codedir}/fastpitch.py \\\n", "--config-path={os.path.abspath(confdir)} \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", @@ -409,8 +411,8 @@ "pitch_std={pitch_std} \\\n", "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", "heteronyms_path={normdir}/heteronyms-052722 \\\n", - "model.n_speakers=5 \\\n", - "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.LookupTable\" \\\n", + "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.SpeakerLookupTable\" \\\n", + "model.speaker_encoder.lookup_module.n_speakers=5 \\\n", "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", @@ -425,6 +427,10 @@ "model.validation_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.min_duration=0.1 \\\n", "+model.text_tokenizer.add_blank_at=True \\\n", + "model.train_ds.dataset.trim=True \\\n", + "model.validation_ds.dataset.trim=True \\\n", + "+model.train_ds.dataset.trim_top_db=20 \\\n", + "+model.validation_ds.dataset.trim_top_db=20 \\\n", "exp_manager.exp_dir={logsdir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", @@ -441,7 +447,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c0a115ba", + "id": "594a12c7", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +459,7 @@ }, { "cell_type": "markdown", - "id": "b4c430d1", + "id": "21760d8c", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -461,7 +467,7 @@ }, { "cell_type": "markdown", - "id": "a12d558c", + "id": "059a5c57", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -470,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b49320ea", + "id": "f21ae142", "metadata": {}, "outputs": [], "source": [ @@ -483,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a503861d", + "id": "a0c916de", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +547,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f947676c", + "id": "141303a7", "metadata": {}, "outputs": [], "source": [ @@ -552,7 +558,7 @@ { "cell_type": "code", "execution_count": null, - "id": "146a5027", + "id": "4ea69569", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +594,7 @@ }, { "cell_type": "markdown", - "id": "59830d4b", + "id": "fceb5623", "metadata": {}, "source": [ "## b. 
Training" @@ -597,7 +603,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3586b593", + "id": "d0ca65ec", "metadata": {}, "outputs": [], "source": [ @@ -611,7 +617,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83ff072a", + "id": "593513f2", "metadata": {}, "outputs": [], "source": [ @@ -641,7 +647,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f999e8c", + "id": "84fed050", "metadata": {}, "outputs": [], "source": [ @@ -653,7 +659,7 @@ }, { "cell_type": "markdown", - "id": "b8da2e2b", + "id": "f94cd841", "metadata": {}, "source": [ "# 3. Inference" @@ -662,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "495e095f", + "id": "2807c212", "metadata": {}, "outputs": [], "source": [ @@ -673,7 +679,7 @@ }, { "cell_type": "markdown", - "id": "9254244e", + "id": "8dbac3ae", "metadata": {}, "source": [ "## a. Load Model" @@ -682,7 +688,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52ff6e88", + "id": "3e42dfe9", "metadata": {}, "outputs": [], "source": [ @@ -692,7 +698,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a09cefcd", + "id": "418c198e", "metadata": {}, "outputs": [], "source": [ @@ -703,7 +709,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59277b2b", + "id": "fca84cda", "metadata": {}, "outputs": [], "source": [ @@ -713,7 +719,7 @@ }, { "cell_type": "markdown", - "id": "df5eae8c", + "id": "1ef482fe", "metadata": {}, "source": [ "## b. Output Audio" @@ -722,7 +728,7 @@ { "cell_type": "code", "execution_count": null, - "id": "234aea6d", + "id": "b646d311", "metadata": {}, "outputs": [], "source": [ @@ -757,7 +763,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf0be104", + "id": "e5541924", "metadata": {}, "outputs": [], "source": [ @@ -783,7 +789,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e3328e3", + "id": "d76870d7", "metadata": {}, "outputs": [], "source": [ @@ -818,7 +824,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c7ca983", + "id": "735bbbf9", "metadata": {}, "outputs": [], "source": [ @@ -828,7 +834,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c158077", + "id": "c7111d2a", "metadata": {}, "outputs": [], "source": [ @@ -838,7 +844,7 @@ { "cell_type": "code", "execution_count": null, - "id": "685905cd", + "id": "1be132a3", "metadata": {}, "outputs": [], "source": [] From c33c1884a69242bf15a94e75d82a4627231bda15 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 18 Apr 2023 15:36:39 -0700 Subject: [PATCH 05/25] Update tutorial Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 124 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 115 ++++++++-------- 2 files changed, 116 insertions(+), 123 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 26ef487830c4..3176c2e648f4 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d7754dcf", + "id": "05aec279", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -17,18 +17,18 @@ "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", "* Training: fine-tune HiFiGAN with fine-tuned adaptation data.\n", "4. 
**Inference**: generate speech from adpated FastPitch\n", - "* Load Model: load pre-trained multi-speaker FastPitch with fine-tuned adapters.\n", + "* Load Model: load pre-trained multi-speaker FastPitch with **fine-tuned adapters**.\n", "* Output Audio: generate audio files." ] }, { "cell_type": "markdown", - "id": "91190b36", + "id": "e027e108", "metadata": {}, "source": [ "# License\n", "\n", - "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", "> \n", "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", "> you may not use this file except in compliance with the License.\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71341824", + "id": "e110c45f", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "587e2ccd", + "id": "2d0589c6", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7877db29", + "id": "fa18cbc6", "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "deb47145", + "id": "ca771e64", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c7560cad", + "id": "797c572f", "metadata": {}, "outputs": [], "source": [ @@ -130,7 +130,7 @@ }, { "cell_type": "markdown", - "id": "2c40e507", + "id": "11ed358f", "metadata": {}, "source": [ "# 1. Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21f55cf8", + "id": "10c0ac90", "metadata": {}, "outputs": [], "source": [ @@ -150,7 +150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f88696b9", + "id": "986f24da", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +182,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33bfaac9", + "id": "34c20f4d", "metadata": {}, "outputs": [], "source": [ @@ -194,7 +194,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12b65006", + "id": "1408d79e", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +206,7 @@ }, { "cell_type": "markdown", - "id": "4eb540c9", + "id": "a6e53914", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "126e8a04", + "id": "9c150400", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -224,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1442ea2", + "id": "8d115852", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +234,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a14ef048", + "id": "ddc0e234", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "951b5f1a", + "id": "d8217f2a", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." @@ -253,7 +253,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4d9fd8f2", + "id": "ce9133df", "metadata": {}, "outputs": [], "source": [ @@ -263,7 +263,7 @@ }, { "cell_type": "markdown", - "id": "c51e0e87", + "id": "7f3d8e85", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -272,7 +272,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d02029ed", + "id": "479a0091", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "cfcc7db6", + "id": "36cf60b4", "metadata": {}, "source": [ "### Add absolute file path in manifest" @@ -293,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c0700d93", + "id": "58d5edc2", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +311,7 @@ { "cell_type": "code", "execution_count": null, - "id": "643cf612", + "id": "bbef786a", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9faa1fdd", + "id": "9d2ac995", "metadata": {}, "outputs": [], "source": [ @@ -334,7 +334,7 @@ }, { "cell_type": "markdown", - "id": "087c29e7", + "id": "ad52fefa", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -343,7 +343,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f2dad27", + "id": "b55f5957", "metadata": {}, "outputs": [], "source": [ @@ -358,7 +358,7 @@ }, { "cell_type": "markdown", - "id": "7f9a7a6e", + "id": "7ec9e1c3", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b7179f5", + "id": "c089df5d", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd41d74a", + "id": "3e024c15", "metadata": {}, "outputs": [], "source": [ @@ -412,7 +412,7 @@ { "cell_type": "code", "execution_count": null, - "id": "016b039e", + "id": "2be656c0", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ }, { "cell_type": "markdown", - "id": "cd87cceb", + "id": "7e4a22b0", "metadata": {}, "source": [ "## c. Training" @@ -449,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c7724add", + "id": "97cc9209", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a969914", + "id": "5a1b98c8", "metadata": {}, "outputs": [], "source": [ @@ -468,7 +468,7 @@ }, { "cell_type": "markdown", - "id": "96071e54", + "id": "42d04da0", "metadata": {}, "source": [ "### Important notes\n", @@ -479,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6f1cc94", + "id": "3c744d5e", "metadata": {}, "outputs": [], "source": [ @@ -530,7 +530,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31286dca", + "id": "727340b1", "metadata": {}, "outputs": [], "source": [ @@ -542,7 +542,7 @@ }, { "cell_type": "markdown", - "id": "06e8f037", + "id": "0e0d4817", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -550,7 +550,7 @@ }, { "cell_type": "markdown", - "id": "40026019", + "id": "8db17d0a", "metadata": {}, "source": [ "## a. 
Dataset Preparation" @@ -559,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3630a563", + "id": "09c7b3fc", "metadata": {}, "outputs": [], "source": [ @@ -572,7 +572,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef6edacb", + "id": "7aabc329", "metadata": {}, "outputs": [], "source": [ @@ -629,7 +629,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a62bc5ff", + "id": "6a1ddb24", "metadata": {}, "outputs": [], "source": [ @@ -646,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a6e50d", + "id": "e68fa20b", "metadata": {}, "outputs": [], "source": [ @@ -680,7 +680,7 @@ }, { "cell_type": "markdown", - "id": "a68926d1", + "id": "2d4c8389", "metadata": {}, "source": [ "## b. Training" @@ -689,7 +689,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90ef58be", + "id": "a9d6337e", "metadata": {}, "outputs": [], "source": [ @@ -703,7 +703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7f8d543", + "id": "325e91e2", "metadata": {}, "outputs": [], "source": [ @@ -733,7 +733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "919a519d", + "id": "53a71106", "metadata": {}, "outputs": [], "source": [ @@ -745,7 +745,7 @@ }, { "cell_type": "markdown", - "id": "378fe2e3", + "id": "a76168eb", "metadata": {}, "source": [ "# 3. Inference" @@ -754,7 +754,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aabb1a66", + "id": "c23cc443", "metadata": {}, "outputs": [], "source": [ @@ -765,7 +765,7 @@ }, { "cell_type": "markdown", - "id": "7274970d", + "id": "9c04b3d6", "metadata": {}, "source": [ "## a. Load Model" @@ -774,7 +774,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c32b949", + "id": "d4a38a67", "metadata": {}, "outputs": [], "source": [ @@ -784,7 +784,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75982012", + "id": "16bf1bf1", "metadata": {}, "outputs": [], "source": [ @@ -799,7 +799,7 @@ { "cell_type": "code", "execution_count": null, - "id": "580fb6c6", + "id": "2d577d41", "metadata": {}, "outputs": [], "source": [ @@ -809,7 +809,7 @@ }, { "cell_type": "markdown", - "id": "45533e1a", + "id": "f85299bf", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -818,7 +818,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e85db0f", + "id": "532ebf2c", "metadata": {}, "outputs": [], "source": [ @@ -851,7 +851,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78d23c8f", + "id": "8d710a14", "metadata": {}, "outputs": [], "source": [ @@ -874,7 +874,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8485d48", + "id": "43d36252", "metadata": {}, "outputs": [], "source": [ @@ -906,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c137592", + "id": "f10420ea", "metadata": {}, "outputs": [], "source": [ @@ -916,7 +916,7 @@ { "cell_type": "code", "execution_count": null, - "id": "081bee07", + "id": "cedb0350", "metadata": {}, "outputs": [], "source": [ @@ -926,7 +926,7 @@ { "cell_type": "code", "execution_count": null, - "id": "997d2f41", + "id": "679d2695", "metadata": {}, "outputs": [], "source": [ @@ -936,7 +936,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ac4b8d4", + "id": "8c664168", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 8bb4e27e5fcd..799e95e80a87 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "13c04c94", + "id": "bcbe848f", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -14,7 +14,7 @@ "* Training: pre-train multispeaker FastPitch\n", " * Input: we introduce additional speaker id and reference audio.\n", " * Speaker: we have looked-up speaker embedding and speaker encoder. \n", - " * Condition: we can condition pitch/duration predictors, mel-spectrogram decoder, aligner, and layernorm layers.\n", + " * Condition: we can condition phoneme encoder, pitch/duration predictors, mel-spectrogram decoder, aligner with add and layernorm operation.\n", "2. **Fine-tune HiFiGAN on multi-speaker data**: fine-tune a vocoder for the pre-trained multi-speaker FastPitch\n", "* Dataset Preparation: extract mel-spectrograms from pre-trained FastPitch.\n", "* Training: fine-tune HiFiGAN with pre-trained multi-speaker data.\n", @@ -25,12 +25,11 @@ }, { "cell_type": "markdown", - "id": "f0717c51", + "id": "6aa175ad", "metadata": {}, "source": [ "# License\n", - "\n", - "> Copyright 2023 NVIDIA. All Rights Reserved.\n", + "> Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", "> \n", "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", "> you may not use this file except in compliance with the License.\n", @@ -48,7 +47,7 @@ { "cell_type": "code", "execution_count": null, - "id": "136dd989", + "id": "f2b90dcd", "metadata": {}, "outputs": [], "source": [ @@ -70,7 +69,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd1f0fe2", + "id": "50e34752", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +79,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6364adf3", + "id": "4f9105f2", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +103,7 @@ { "cell_type": "code", "execution_count": null, - "id": "633b93a7", + "id": "a3c46dab", "metadata": {}, "outputs": [], "source": [ @@ -120,7 +119,7 @@ }, { "cell_type": "markdown", - "id": "75c95297", + "id": "0c3478f9", "metadata": {}, "source": [ "# 1. 
Pre-train FastPitch on multi-speaker data" @@ -128,7 +127,7 @@ }, { "cell_type": "markdown", - "id": "312d1b49", + "id": "ba034e43", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -139,7 +138,7 @@ { "cell_type": "code", "execution_count": null, - "id": "394a25ed", + "id": "92ffc28d", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +148,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17cf9dae", + "id": "7758f0a1", "metadata": {}, "outputs": [], "source": [ @@ -159,7 +158,7 @@ }, { "cell_type": "markdown", - "id": "f49d301f", + "id": "b239e1d2", "metadata": {}, "source": [ "For simplicity, we use original dev set as training set and original test set as validation set." @@ -168,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4711864", + "id": "03c33763", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "342b056f", + "id": "fc0824a5", "metadata": {}, "source": [ "## b. Preprocessing" @@ -187,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bd08c5d8", + "id": "3ac16425", "metadata": {}, "outputs": [], "source": [ @@ -199,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "be281bd0", + "id": "29b11129", "metadata": {}, "source": [ "### Add absoluate audio path in manifest" @@ -208,7 +207,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7f4b8a8", + "id": "8c062437", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +225,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eaa83d5a", + "id": "586e604a", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "954dc35b", + "id": "89e10b53", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0" @@ -250,7 +249,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12b0d1ef", + "id": "d0a0f23d", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +265,7 @@ }, { "cell_type": "markdown", - "id": "1bc147e9", + "id": "14c74cab", "metadata": {}, "source": [ "### Calculate Pitch Stats" @@ -275,7 +274,7 @@ { "cell_type": "code", "execution_count": null, - "id": "002bb013", + "id": "e3344194", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3aa5d471", + "id": "899abf95", "metadata": {}, "outputs": [], "source": [ @@ -320,7 +319,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d1b12283", + "id": "c155d4b9", "metadata": {}, "outputs": [], "source": [ @@ -348,7 +347,7 @@ }, { "cell_type": "markdown", - "id": "a33692c5", + "id": "a41ee532", "metadata": {}, "source": [ "## c. 
Training" @@ -357,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "022540fb", + "id": "7f954beb", "metadata": {}, "outputs": [], "source": [ @@ -367,7 +366,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b05c1ce", + "id": "e82e6860", "metadata": {}, "outputs": [], "source": [ @@ -376,7 +375,7 @@ }, { "cell_type": "markdown", - "id": "33ce089f", + "id": "12307264", "metadata": {}, "source": [ "### Important notes\n", @@ -391,13 +390,11 @@ { "cell_type": "code", "execution_count": null, - "id": "40e890fd", + "id": "6ffdce37", "metadata": {}, "outputs": [], "source": [ "# Normally 200 epochs\n", - "# `dataset.trim=True` only used for VCTK\n", - "# `dataset.trim_top_db=20` only used for VCTK\n", "!(python {codedir}/fastpitch.py \\\n", "--config-path={os.path.abspath(confdir)} \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", @@ -427,10 +424,6 @@ "model.validation_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.min_duration=0.1 \\\n", "+model.text_tokenizer.add_blank_at=True \\\n", - "model.train_ds.dataset.trim=True \\\n", - "model.validation_ds.dataset.trim=True \\\n", - "+model.train_ds.dataset.trim_top_db=20 \\\n", - "+model.validation_ds.dataset.trim_top_db=20 \\\n", "exp_manager.exp_dir={logsdir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", @@ -447,7 +440,7 @@ { "cell_type": "code", "execution_count": null, - "id": "594a12c7", + "id": "73bc625f", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "21760d8c", + "id": "711a9913", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -467,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "059a5c57", + "id": "88364f18", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -476,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f21ae142", + "id": "ff07b4d2", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +482,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a0c916de", + "id": "68712a9d", "metadata": {}, "outputs": [], "source": [ @@ -547,7 +540,7 @@ { "cell_type": "code", "execution_count": null, - "id": "141303a7", + "id": "f4dbec65", "metadata": {}, "outputs": [], "source": [ @@ -558,7 +551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ea69569", + "id": "3b8809ee", "metadata": {}, "outputs": [], "source": [ @@ -594,7 +587,7 @@ }, { "cell_type": "markdown", - "id": "fceb5623", + "id": "d5740a15", "metadata": {}, "source": [ "## b. Training" @@ -603,7 +596,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0ca65ec", + "id": "7af905c2", "metadata": {}, "outputs": [], "source": [ @@ -617,7 +610,7 @@ { "cell_type": "code", "execution_count": null, - "id": "593513f2", + "id": "0ab9fe5b", "metadata": {}, "outputs": [], "source": [ @@ -647,7 +640,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84fed050", + "id": "2d427f5b", "metadata": {}, "outputs": [], "source": [ @@ -659,7 +652,7 @@ }, { "cell_type": "markdown", - "id": "f94cd841", + "id": "90ec6e21", "metadata": {}, "source": [ "# 3. Inference" @@ -668,7 +661,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2807c212", + "id": "a522779a", "metadata": {}, "outputs": [], "source": [ @@ -679,7 +672,7 @@ }, { "cell_type": "markdown", - "id": "8dbac3ae", + "id": "64490c8a", "metadata": {}, "source": [ "## a. 
Load Model" @@ -688,7 +681,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e42dfe9", + "id": "51d78337", "metadata": {}, "outputs": [], "source": [ @@ -698,7 +691,7 @@ { "cell_type": "code", "execution_count": null, - "id": "418c198e", + "id": "b054e4a5", "metadata": {}, "outputs": [], "source": [ @@ -709,7 +702,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fca84cda", + "id": "00a8a5af", "metadata": {}, "outputs": [], "source": [ @@ -719,7 +712,7 @@ }, { "cell_type": "markdown", - "id": "1ef482fe", + "id": "b9c726e4", "metadata": {}, "source": [ "## b. Output Audio" @@ -728,7 +721,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b646d311", + "id": "d2974168", "metadata": {}, "outputs": [], "source": [ @@ -763,7 +756,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5541924", + "id": "96fcb433", "metadata": {}, "outputs": [], "source": [ @@ -789,7 +782,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d76870d7", + "id": "7ae8f4e3", "metadata": {}, "outputs": [], "source": [ @@ -824,7 +817,7 @@ { "cell_type": "code", "execution_count": null, - "id": "735bbbf9", + "id": "884e6906", "metadata": {}, "outputs": [], "source": [ @@ -834,7 +827,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c7111d2a", + "id": "8d17f232", "metadata": {}, "outputs": [], "source": [ @@ -844,7 +837,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1be132a3", + "id": "7f49ee8a", "metadata": {}, "outputs": [], "source": [] From a57e12a1ef1893d5355a6f4c747c1ad88cc4964d Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 24 Apr 2023 12:12:00 -0700 Subject: [PATCH 06/25] Follow comments Signed-off-by: hsiehjackson --- .../tts/extract_sup_data.py | 16 +- .../tts/resynthesize_dataset.py | 12 + .../tts/FastPitch_Adapter_Finetuning.ipynb | 421 ++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 486 +++++++----------- 4 files changed, 372 insertions(+), 563 deletions(-) diff --git a/scripts/dataset_processing/tts/extract_sup_data.py b/scripts/dataset_processing/tts/extract_sup_data.py index 57fa220a733c..b82d3182e3b2 100644 --- a/scripts/dataset_processing/tts/extract_sup_data.py +++ b/scripts/dataset_processing/tts/extract_sup_data.py @@ -31,7 +31,7 @@ def get_pitch_stats(pitch_list): def preprocess_ds_for_fastpitch_align(dataloader): pitch_list = [] for batch in tqdm(dataloader, total=len(dataloader)): - audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices, pitches, pitches_lengths = batch + pitches = batch["pitch"] pitch = pitches.squeeze(0) pitch_list.append(pitch[pitch != 0]) @@ -41,17 +41,7 @@ def preprocess_ds_for_fastpitch_align(dataloader): def preprocess_ds_for_mixer_tts_x(dataloader): pitch_list = [] for batch in tqdm(dataloader, total=len(dataloader)): - ( - audios, - audio_lengths, - tokens, - tokens_lengths, - align_prior_matrices, - pitches, - pitches_lengths, - lm_tokens, - ) = batch - + pitches = batch["pitch"] pitch = pitches.squeeze(0) pitch_list.append(pitch[pitch != 0]) @@ -60,7 +50,7 @@ def preprocess_ds_for_mixer_tts_x(dataloader): CFG_NAME2FUNC = { "ds_for_fastpitch_align": preprocess_ds_for_fastpitch_align, - "ds_for_mixer_tts": preprocess_ds_for_fastpitch_align, + "ds_for_mixer_tts": preprocess_ds_for_mixer_tts, "ds_for_mixer_tts_x": preprocess_ds_for_mixer_tts_x, } diff --git a/scripts/dataset_processing/tts/resynthesize_dataset.py b/scripts/dataset_processing/tts/resynthesize_dataset.py index cacd41e93109..996868625147 100644 --- 
a/scripts/dataset_processing/tts/resynthesize_dataset.py +++ b/scripts/dataset_processing/tts/resynthesize_dataset.py @@ -80,6 +80,7 @@ def chunks(iterable: Iterable, size: int) -> Iterator[List]: def load_model(path: Path, device: torch.device) -> SpectrogramGenerator: + print(path) model = None if path.suffix == ".nemo": model = SpectrogramGenerator.restore_from(path, map_location=device) @@ -117,6 +118,15 @@ def resynthesize_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: batch = to_device_recursive(batch, self.device) mels, mel_lens = self.model.preprocessor(input_signal=batch["audio"], length=batch["audio_lens"]) + + reference_audio = batch.get("reference_audio", None) + reference_audio_len = batch.get("reference_audio_lens", None) + reference_spec, reference_spec_len = None, None + if reference_audio is not None: + reference_spec, reference_spec_len = self.model.preprocessor( + input_signal=reference_audio, length=reference_audio_len + ) + outputs_tuple = self.model.forward( text=batch["text"], durs=None, @@ -127,6 +137,8 @@ def resynthesize_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: attn_prior=batch.get("attn_prior"), mel_lens=mel_lens, input_lens=batch["text_lens"], + reference_spec=reference_spec, + reference_spec_lens=reference_spec_len, ) names = self.model.fastpitch.output_types.keys() return {"spec": mels, "mel_lens": mel_lens, **dict(zip(names, outputs_tuple))} diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 3176c2e648f4..e47f1ae94228 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "05aec279", + "id": "82f4dfee", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -11,7 +11,7 @@ "1. **Transform pre-trained FastPitch checkpoint to adapter-compatible checkpoint**\n", "2. **Fine-tune FastPitch on adaptation data**: fine-tune pre-trained multi-speaker FastPitch for a new speaker\n", "* Dataset Preparation: download dataset and extract manifest files. (duration more than 15 mins)\n", - "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Preprocessing: add absolute audio paths in manifest and extract Supplementary Data.\n", "* Training: fine-tune frozen multispeaker FastPitch with trainable adapters.\n", "3. 
**Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "e027e108", + "id": "fabf7666", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e110c45f", + "id": "d6941063", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d0589c6", + "id": "53792663", "metadata": {}, "outputs": [], "source": [ @@ -78,18 +78,18 @@ { "cell_type": "code", "execution_count": null, - "id": "fa18cbc6", + "id": "eb5ae13a", "metadata": {}, "outputs": [], "source": [ - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"\"\n", - "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"\"" + "pretrained_fastpitch_checkpoint = \"\"\n", + "finetuned_hifigan_on_multispeaker_checkpoint = \"\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "ca771e64", + "id": "21528f24", "metadata": {}, "outputs": [], "source": [ @@ -98,12 +98,8 @@ "codedir = 'NeMoTTS' \n", "# Store all manifest and audios\n", "datadir = 'NeMoTTS_dataset'\n", - "# Store all related text-normalized files\n", - "normdir = 'NeMoTTS_normalize_files'\n", "# Store all supplementary files\n", "suppdir = \"NeMoTTS_sup_data\"\n", - "# Store all config files\n", - "confdir = \"NeMoTTS_conf\"\n", "# Store all training logs\n", "logsdir = \"NeMoTTS_logs\"\n", "# Store all mel-spectrograms for vocoder training\n", @@ -113,7 +109,19 @@ { "cell_type": "code", "execution_count": null, - "id": "797c572f", + "id": "ac1f2e95", + "metadata": {}, + "outputs": [], + "source": [ + "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", + "# comment out the below lines and set `codedir` to your local path.\n", + "# !git clone https://github.com/NVIDIA/NeMo.git {codedir}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26be2e0d", "metadata": {}, "outputs": [], "source": [ @@ -128,9 +136,28 @@ "from tqdm import tqdm" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "5018dffc", + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(codedir, exist_ok=True)\n", + "codedir = os.path.abspath(codedir)\n", + "os.makedirs(datadir, exist_ok=True)\n", + "datadir = os.path.abspath(datadir)\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "suppdir = os.path.abspath(suppdir)\n", + "os.makedirs(logsdir, exist_ok=True)\n", + "logsdir = os.path.abspath(logsdir)\n", + "os.makedirs(melsdir, exist_ok=True)\n", + "melsdir = os.path.abspath(melsdir)" + ] + }, { "cell_type": "markdown", - "id": "11ed358f", + "id": "386399a5", "metadata": {}, "source": [ "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -139,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10c0ac90", + "id": "c8f67823", "metadata": {}, "outputs": [], "source": [ @@ -150,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "986f24da", + "id": "dfbef08c", "metadata": {}, "outputs": [], "source": [ @@ -182,31 +209,31 @@ { "cell_type": "code", "execution_count": null, - "id": "34c20f4d", + "id": "ea30c2cb", "metadata": {}, "outputs": [], "source": [ - "state = torch.load(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "state = torch.load(pretrained_fastpitch_checkpoint)\n", "state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", - "torch.save(state, YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" + "torch.save(state, pretrained_fastpitch_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, - "id": "1408d79e", + "id": "2729d0a8", "metadata": {}, "outputs": [], "source": [ - "shutil.copyfile(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT, \"FastPitch.pt\")\n", - "shutil.copyfile(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT, \"HifiGan.pt\")\n", - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = \"FastPitch.pt\"\n", - "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = \"HifiGan.pt\"" + "shutil.copyfile(pretrained_fastpitch_checkpoint, \"FastPitch.ckpt\")\n", + "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"HifiGan.ckpt\")\n", + "pretrained_fastpitch_checkpoint = os.path.abspath(\"FastPitch.ckpt\")\n", + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"HifiGan.ckpt\")" ] }, { "cell_type": "markdown", - "id": "a6e53914", + "id": "f5c09c46", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -214,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "9c150400", + "id": "1ec1439e", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -224,17 +251,17 @@ { "cell_type": "code", "execution_count": null, - "id": "8d115852", + "id": "bfa82a6e", "metadata": {}, "outputs": [], "source": [ - "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" + "!cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" ] }, { "cell_type": "code", "execution_count": null, - "id": "ddc0e234", + "id": "567bf4a4", "metadata": {}, "outputs": [], "source": [ @@ -242,18 +269,10 @@ "!ls {manidir}" ] }, - { - "cell_type": "markdown", - "id": "d8217f2a", - "metadata": {}, - "source": [ - "For simplicity, we use original dev set as training set and original test set as validation set." - ] - }, { "cell_type": "code", "execution_count": null, - "id": "ce9133df", + "id": "5ce43372", "metadata": {}, "outputs": [], "source": [ @@ -263,37 +282,25 @@ }, { "cell_type": "markdown", - "id": "7f3d8e85", + "id": "6fdc5d99", "metadata": {}, "source": [ "## b. 
Preprocessing" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "479a0091", - "metadata": {}, - "outputs": [], - "source": [ - "# additional files\n", - "!mkdir -p {normdir} && cd {normdir} \\\n", - "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", - "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" - ] - }, { "cell_type": "markdown", - "id": "36cf60b4", + "id": "4693c1ed", "metadata": {}, "source": [ - "### Add absolute file path in manifest" + "### Add absolute file path in manifest\n", + "We use absoluate path for audio_filepath to get the audio during training." ] }, { "cell_type": "code", "execution_count": null, - "id": "58d5edc2", + "id": "f1ce9c0c", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbef786a", + "id": "a927db03", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d2ac995", + "id": "0d4fbf55", "metadata": {}, "outputs": [], "source": [ @@ -334,113 +341,63 @@ }, { "cell_type": "markdown", - "id": "ad52fefa", + "id": "540ea432", "metadata": {}, "source": [ - "### Calibrate speaker id to start from 0" + "### Extract Supplementary Data\n", + "\n", + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script.\n", + "\n", + "Note: This is an optional step, if skipped, it will be automatically executed within the first epoch of training FastPitch." ] }, { "cell_type": "code", "execution_count": null, - "id": "b55f5957", + "id": "fe0a1295", "metadata": {}, "outputs": [], "source": [ - "train_datas = json_reader(train_manifest)\n", - "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", - "json_writer(train_datas, train_manifest)\n", - "\n", - "valid_datas = json_reader(valid_manifest)\n", - "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], 0\n", - "json_writer(valid_datas, valid_manifest)" + "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + " manifest_filepath={train_manifest} \\\n", + " sup_data_path={suppdir} \\\n", + " dataset.sample_rate={sample_rate} \\\n", + " dataset.n_fft=2048 \\\n", + " dataset.win_length=2048 \\\n", + " dataset.hop_length=512" ] }, { "cell_type": "markdown", - "id": "7ec9e1c3", + "id": "2fe3edd3", "metadata": {}, "source": [ - "### Calculate Pitch Stats" + "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. 
We will be there in the following section.\n", + "```bash\n", + "PITCH_MEAN=175.48513793945312, PITCH_STD=42.3786735534668\n", + "PITCH_MIN=65.4063949584961, PITCH_MAX=270.8517761230469\n", + "```" ] }, { "cell_type": "code", "execution_count": null, - "id": "c089df5d", + "id": "a816527f", "metadata": {}, "outputs": [], "source": [ - "import librosa\n", - "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", - "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e024c15", - "metadata": {}, - "outputs": [], - "source": [ - "def get_pitch(sample): \n", - " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", - " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", - " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", - " \n", - " if os.path.exists(pitch_filepath):\n", - " pitch = torch.load(pitch_filepath).numpy()\n", - "\n", - " else:\n", - " features = wave_model.process(\n", - " sample[\"audio_filepath\"]\n", - " )\n", - " voiced_tuple = librosa.pyin(\n", - " features.numpy(),\n", - " fmin=librosa.note_to_hz('C2'),\n", - " fmax=librosa.note_to_hz('C7'),\n", - " frame_length=2048,\n", - " sr=44100,\n", - " fill_na=0.0,\n", - " )\n", - " pitch = voiced_tuple[0]\n", - " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", - " \n", - " return pitch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2be656c0", - "metadata": {}, - "outputs": [], - "source": [ - "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", - "pitch_dir = os.path.join(suppdir, 'pitch')\n", - "os.makedirs(suppdir, exist_ok=True)\n", - "os.makedirs(pitch_dir, exist_ok=True)\n", - "\n", - "train_pitchs = []\n", - "train_datas = json_reader(train_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", - "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", - " \n", - "valid_datas = json_reader(valid_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", - "for m in tqdm(valid_datas): get_pitch(m)\n", - "\n", - "train_pitchs = np.concatenate(train_pitchs)\n", - "pitch_mean = float(np.mean(train_pitchs))\n", - "pitch_std = float(np.std(train_pitchs))\n", - "\n", - "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", - " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + " manifest_filepath={valid_manifest} \\\n", + " sup_data_path={suppdir} \\\n", + " dataset.sample_rate={sample_rate} \\\n", + " dataset.n_fft=2048 \\\n", + " dataset.win_length=2048 \\\n", + " dataset.hop_length=512" ] }, { "cell_type": "markdown", - "id": "7e4a22b0", + "id": "4f90d6a1", "metadata": {}, "source": [ "## c. 
Training" @@ -449,66 +406,52 @@ { "cell_type": "code", "execution_count": null, - "id": "97cc9209", + "id": "eaa29988", "metadata": {}, "outputs": [], "source": [ - "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a1b98c8", - "metadata": {}, - "outputs": [], - "source": [ - "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch_finetune_adapters.py" + "phoneme_dict_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", + "heteronyms_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", + "\n", + "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", + "PITCH_MEAN=175.48513793945312\n", + "PITCH_STD=42.3786735534668" ] }, { "cell_type": "markdown", - "id": "42d04da0", + "id": "4bc250e8", "metadata": {}, "source": [ "### Important notes\n", - "* **+init_from_ptl_ckpt**: initialize with a multi-speaker FastPitch checkpoint\n", - "* **~model.speaker_encoder.lookup_module**: remove the pre-trained looked-up speaker embedding" + "* `+init_from_ptl_ckpt`: initialize with a multi-speaker FastPitch checkpoint\n", + "* `~model.speaker_encoder.lookup_module`: remove the pre-trained looked-up speaker embedding\n", + "* Other optional arguments based on your preference:\n", + " * batch_size\n", + " * exp_manager\n", + " * trainer" ] }, { "cell_type": "code", "execution_count": null, - "id": "3c744d5e", + "id": "062417ea", "metadata": {}, "outputs": [], "source": [ "# Normally 100 epochs (15 mins)\n", - "!(python {codedir}/fastpitch_finetune_adapters.py \\\n", - "--config-path={os.path.abspath(confdir)} \\\n", + "!cd {codedir} && python examples/tts/fastpitch_finetune_adapters.py \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", - "+init_from_ptl_ckpt={YOUR_PRETRAINED_FASTPITCH_CHECKPOINT} \\\n", - "sample_rate=44100 \\\n", + "+init_from_ptl_ckpt={pretrained_fastpitch_checkpoint} \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", "sup_data_path={suppdir} \\\n", - "pitch_mean={pitch_mean} \\\n", - "pitch_std={pitch_std} \\\n", - "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", - "heteronyms_path={normdir}/heteronyms-052722 \\\n", + "pitch_mean={PITCH_MEAN} \\\n", + "pitch_std={PITCH_STD} \\\n", "~model.speaker_encoder.lookup_module \\\n", - "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", - "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", - "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", - "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", - "model.pitch_predictor.condition_types=\"['add', 'layernorm']\" \\\n", - "model.alignment_module.condition_types=\"['add']\" \\\n", "model.train_ds.dataloader_params.batch_size=8 \\\n", "model.validation_ds.dataloader_params.batch_size=8 \\\n", - "model.train_ds.dataloader_params.num_workers=8 \\\n", - "model.validation_ds.dataloader_params.num_workers=8 \\\n", - "+model.text_tokenizer.add_blank_at=True \\\n", "model.optim.name=adam \\\n", "model.optim.lr=2e-4 \\\n", 
"model.optim.weight_decay=0.0 \\\n", @@ -523,26 +466,25 @@ "trainer.log_every_n_steps=1 \\\n", "trainer.devices=1 \\\n", "trainer.strategy=ddp \\\n", - "trainer.precision=32 \\\n", - ")" + "trainer.precision=32" ] }, { "cell_type": "code", "execution_count": null, - "id": "727340b1", + "id": "e44dacbf", "metadata": {}, "outputs": [], "source": [ - "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S\n", + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "YOUR_FINETUNED_ADAPTER_CHECKPOINT = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", - "YOUR_FINETUNED_ADAPTER_CHECKPOINT" + "finetuned_adapter_checkpoint = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", + "print(finetuned_adapter_checkpoint)" ] }, { "cell_type": "markdown", - "id": "0e0d4817", + "id": "c0fee754", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -550,20 +492,22 @@ }, { "cell_type": "markdown", - "id": "8db17d0a", + "id": "787c6390", "metadata": {}, "source": [ - "## a. Dataset Preparation" + "## a. Dataset Preparation\n", + "Generate mel-spectrograms for HiFiGAN training." ] }, { "cell_type": "code", "execution_count": null, - "id": "09c7b3fc", + "id": "c831eb89", "metadata": {}, "outputs": [], "source": [ "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", "from nemo.collections.tts.models import FastPitchModel\n", "from collections import defaultdict\n", "import random" @@ -572,11 +516,11 @@ { "cell_type": "code", "execution_count": null, - "id": "7aabc329", + "id": "f44c889e", "metadata": {}, "outputs": [], "source": [ - "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", + "def gen_spectrogram(index, manifest, speaker_to_index):\n", " \n", " record = manifest[index]\n", " audio_file = record[\"audio_filepath\"]\n", @@ -600,6 +544,7 @@ " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", + " print(spect.shape, spect_len)\n", " \n", " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", " \n", @@ -629,15 +574,17 @@ { "cell_type": "code", "execution_count": null, - "id": "6a1ddb24", + "id": "07dda4fd", "metadata": {}, "outputs": [], "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "\n", "# Pretrained FastPitch Weights\n", - "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", + "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint)\n", "\n", "# Load Adapter Weights\n", - "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", "spec_model.eval().cuda()\n", "\n", "beta_binomial_interpolator = BetaBinomialInterpolator()" @@ -646,7 +593,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e68fa20b", + "id": "2b5f69d7", "metadata": {}, "outputs": [], "source": [ @@ -654,33 +601,31 @@ "\n", "# Train\n", "train_datas = json_reader(train_manifest)\n", - "base_data_dir = 
get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", "speaker_to_index = defaultdict(list)\n", "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", "\n", "for i, record in enumerate(tqdm(train_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index)\n", "\n", "json_writer(train_datas, train_manifest)\n", "\n", "\n", "# Valid\n", "valid_datas = json_reader(valid_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", "speaker_to_index = defaultdict(list)\n", "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", "\n", "for i, record in enumerate(tqdm(valid_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", + " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index)\n", "\n", "json_writer(valid_datas, valid_manifest)" ] }, { "cell_type": "markdown", - "id": "2d4c8389", + "id": "bcc7f2a5", "metadata": {}, "source": [ "## b. Training" @@ -689,63 +634,47 @@ { "cell_type": "code", "execution_count": null, - "id": "a9d6337e", - "metadata": {}, - "outputs": [], - "source": [ - "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", - "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", - "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", - "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", - "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "325e91e2", + "id": "7c5cd42b", "metadata": {}, "outputs": [], "source": [ "# Normally 500 epochs (30 mins)\n", - "!(python {codedir}/hifigan_finetune.py \\\n", - "--config-path={os.path.abspath(confdir)} \\\n", + "!cd {codedir} && python examples/tts/hifigan_finetune.py \\\n", "--config-name=hifigan_44100.yaml \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", - "+init_from_ptl_ckpt={YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT} \\\n", + "+init_from_ptl_ckpt={finetuned_hifigan_on_multispeaker_checkpoint} \\\n", "model.train_ds.dataloader_params.batch_size=32 \\\n", "model.optim.lr=0.0001 \\\n", - "+trainer.max_epochs=5 \\\n", - "trainer.check_val_every_n_epoch=5 \\\n", "model/train_ds=train_ds_finetune \\\n", "model/validation_ds=val_ds_finetune \\\n", - "trainer.devices=1 \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", + "trainer.devices=-1 \\\n", "trainer.strategy='ddp' \\\n", "trainer.precision=16 \\\n", "exp_manager.exp_dir={logsdir} \\\n", "exp_manager.create_wandb_logger=True \\\n", - 
"exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-adaptation\" \\\n", - "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", - ")" + "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", + "exp_manager.wandb_logger_kwargs.project=\"NeMo\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "53a71106", + "id": "3e7a838e", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX.ckpt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", - "YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT" + "finetuned_hifigan_on_adaptation_checkpoint = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "finetuned_hifigan_on_adaptation_checkpoint" ] }, { "cell_type": "markdown", - "id": "a76168eb", + "id": "86cff00b", "metadata": {}, "source": [ "# 3. Inference" @@ -754,7 +683,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c23cc443", + "id": "bd0b5d12", "metadata": {}, "outputs": [], "source": [ @@ -765,7 +694,7 @@ }, { "cell_type": "markdown", - "id": "9c04b3d6", + "id": "0b3f773d", "metadata": {}, "source": [ "## a. Load Model" @@ -774,7 +703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d4a38a67", + "id": "5f9ae03a", "metadata": {}, "outputs": [], "source": [ @@ -784,13 +713,13 @@ { "cell_type": "code", "execution_count": null, - "id": "16bf1bf1", + "id": "693c4576", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", - "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)\n", - "spec_model.load_adapters(YOUR_FINETUNED_ADAPTER_CHECKPOINT)\n", + "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint)\n", + "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", "# spec_model.freeze()\n", "# spec_model.unfreeze_enabled_adapters()\n", "spec_model = spec_model.eval().cuda()" @@ -799,17 +728,17 @@ { "cell_type": "code", "execution_count": null, - "id": "2d577d41", + "id": "64ce0e61", "metadata": {}, "outputs": [], "source": [ "# HiFiGAN\n", - "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT).eval().cuda()" + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=finetuned_hifigan_on_adaptation_checkpoint).eval().cuda()" ] }, { "cell_type": "markdown", - "id": "f85299bf", + "id": "d1dae076", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -818,7 +747,7 @@ { "cell_type": "code", "execution_count": null, - "id": "532ebf2c", + "id": "7f394699", "metadata": {}, "outputs": [], "source": [ @@ -851,7 +780,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d710a14", + "id": "753b2811", "metadata": {}, "outputs": [], "source": [ @@ -874,7 +803,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43d36252", + "id": "27b8b66d", "metadata": {}, "outputs": [], "source": [ @@ -906,37 +835,19 @@ { "cell_type": "code", "execution_count": null, - "id": "f10420ea", - "metadata": {}, - "outputs": [], - "source": [ - "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cedb0350", - "metadata": {}, - "outputs": [], - "source": [ - "str(YOUR_FINETUNED_ADAPTER_CHECKPOINT)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "679d2695", + "id": "36568855", "metadata": {}, "outputs": [], "source": [ - "str(YOUR_FINETUNED_HIFIGAN_ON_ADAPTATION_CHECKPOINT)" + "print(f\"FastPitch checkpoint: {pretrained_fastpitch_checkpoint}\")\n", + "print(f\"Adapter checkpoint: {finetuned_adapter_checkpoint}\")\n", + "print(f\"HiFi-Gan checkpoint: {finetuned_hifigan_on_adaptation_checkpoint}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "8c664168", + "id": "9434828c", "metadata": {}, "outputs": [], "source": [] @@ -958,7 +869,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 799e95e80a87..cf804ba8c8ad 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bcbe848f", + "id": "a29bfccd", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -10,11 +10,8 @@ "This notebook is designed to provide a guide on how to run FastPitch MultiSpeaker Pretraining Pipeline. It contains the following sections:\n", "1. **Pre-train FastPitch on multi-speaker data**: pre-train a multi-speaker FastPitch\n", "* Dataset Preparation: download dataset and extract manifest files.\n", - "* Preprocessing: add absolute audio paths in manifest, calculate pitch stats.\n", + "* Preprocessing: add absolute audio paths in manifest, calibrate speaker id to start from 0, and extract Supplementary Data.\n", "* Training: pre-train multispeaker FastPitch\n", - " * Input: we introduce additional speaker id and reference audio.\n", - " * Speaker: we have looked-up speaker embedding and speaker encoder. \n", - " * Condition: we can condition phoneme encoder, pitch/duration predictors, mel-spectrogram decoder, aligner with add and layernorm operation.\n", "2. 
**Fine-tune HiFiGAN on multi-speaker data**: fine-tune a vocoder for the pre-trained multi-speaker FastPitch\n", "* Dataset Preparation: extract mel-spectrograms from pre-trained FastPitch.\n", "* Training: fine-tune HiFiGAN with pre-trained multi-speaker data.\n", @@ -25,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "6aa175ad", + "id": "d14ad044", "metadata": {}, "source": [ "# License\n", @@ -47,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2b90dcd", + "id": "9616932f", "metadata": {}, "outputs": [], "source": [ @@ -69,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50e34752", + "id": "67406c13", "metadata": {}, "outputs": [], "source": [ @@ -79,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f9105f2", + "id": "fba188b2", "metadata": {}, "outputs": [], "source": [ @@ -88,12 +85,8 @@ "codedir = 'NeMoTTS' \n", "# Store all manifest and audios\n", "datadir = 'NeMoTTS_dataset'\n", - "# Store all related text-normalized files\n", - "normdir = 'NeMoTTS_normalize_files'\n", "# Store all supplementary files\n", "suppdir = \"NeMoTTS_sup_data\"\n", - "# Store all config files\n", - "confdir = \"NeMoTTS_conf\"\n", "# Store all training logs\n", "logsdir = \"NeMoTTS_logs\"\n", "# Store all mel-spectrograms for vocoder training\n", @@ -103,7 +96,19 @@ { "cell_type": "code", "execution_count": null, - "id": "a3c46dab", + "id": "9c3931cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", + "# comment out the below lines and set `codedir` to your local path.\n", + "# !git clone https://github.com/NVIDIA/NeMo.git {codedir}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "141b7c04", "metadata": {}, "outputs": [], "source": [ @@ -117,9 +122,28 @@ "from tqdm import tqdm" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "8df0f121", + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(codedir, exist_ok=True)\n", + "codedir = os.path.abspath(codedir)\n", + "os.makedirs(datadir, exist_ok=True)\n", + "datadir = os.path.abspath(datadir)\n", + "os.makedirs(suppdir, exist_ok=True)\n", + "suppdir = os.path.abspath(suppdir)\n", + "os.makedirs(logsdir, exist_ok=True)\n", + "logsdir = os.path.abspath(logsdir)\n", + "os.makedirs(melsdir, exist_ok=True)\n", + "melsdir = os.path.abspath(melsdir)" + ] + }, { "cell_type": "markdown", - "id": "0c3478f9", + "id": "2724dc08", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -127,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "ba034e43", + "id": "bc7a68d8", "metadata": {}, "source": [ "## a. 
Dataset Preparation\n", @@ -138,17 +162,17 @@ { "cell_type": "code", "execution_count": null, - "id": "92ffc28d", + "id": "0630588c", "metadata": {}, "outputs": [], "source": [ - "!mkdir -p {datadir} && cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" + "!cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" ] }, { "cell_type": "code", "execution_count": null, - "id": "7758f0a1", + "id": "6647d9f6", "metadata": {}, "outputs": [], "source": [ @@ -156,18 +180,10 @@ "!ls {manidir}" ] }, - { - "cell_type": "markdown", - "id": "b239e1d2", - "metadata": {}, - "source": [ - "For simplicity, we use original dev set as training set and original test set as validation set." - ] - }, { "cell_type": "code", "execution_count": null, - "id": "03c33763", + "id": "092c43e1", "metadata": {}, "outputs": [], "source": [ @@ -177,37 +193,25 @@ }, { "cell_type": "markdown", - "id": "fc0824a5", + "id": "9fecd29b", "metadata": {}, "source": [ "## b. Preprocessing" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ac16425", - "metadata": {}, - "outputs": [], - "source": [ - "# additional files\n", - "!mkdir -p {normdir} && cd {normdir} \\\n", - "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.10 \\\n", - "&& wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-052722 \\" - ] - }, { "cell_type": "markdown", - "id": "29b11129", + "id": "c0cedc10", "metadata": {}, "source": [ - "### Add absoluate audio path in manifest" + "### Add absoluate audio path in manifest\n", + "We use absoluate path for `audio_filepath` to get the audio during training." ] }, { "cell_type": "code", "execution_count": null, - "id": "8c062437", + "id": "2c66b0e3", "metadata": {}, "outputs": [], "source": [ @@ -225,7 +229,7 @@ { "cell_type": "code", "execution_count": null, - "id": "586e604a", + "id": "3358531c", "metadata": {}, "outputs": [], "source": [ @@ -240,16 +244,17 @@ }, { "cell_type": "markdown", - "id": "89e10b53", + "id": "43b2ddb9", "metadata": {}, "source": [ - "### Calibrate speaker id to start from 0" + "### Calibrate speaker id to start from 0\n", + "We use speaker id start from 0, so we can create a speaker look-up table with speaker size." ] }, { "cell_type": "code", "execution_count": null, - "id": "d0a0f23d", + "id": "e2484099", "metadata": {}, "outputs": [], "source": [ @@ -265,152 +270,148 @@ }, { "cell_type": "markdown", - "id": "14c74cab", + "id": "9da488d8", "metadata": {}, "source": [ - "### Calculate Pitch Stats" + "### Extract Supplementary Data\n", + "\n", + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script.\n", + "\n", + "Note: This is an optional step, if skipped, it will be automatically executed within the first epoch of training FastPitch." 
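+    "\n",
+    "If you want to double-check the statistics the script prints, the minimal sketch below recomputes them from the extracted files. It assumes the per-utterance pitch tensors are written as `.pt` files under the `pitch` sub-folder of `sup_data_path` and that unvoiced frames are stored as zeros (mirroring `extract_sup_data.py`); it is only a sanity check, not a required step.\n",
+    "```python\n",
+    "# Sanity-check sketch: recompute pitch statistics from the extracted supplementary data.\n",
+    "# Assumption: pitch tensors live in {suppdir}/pitch as .pt files, with 0.0 marking unvoiced frames.\n",
+    "import os\n",
+    "from glob import glob\n",
+    "\n",
+    "import torch\n",
+    "\n",
+    "pitch_values = []\n",
+    "for path in glob(os.path.join(suppdir, 'pitch', '*.pt')):\n",
+    "    pitch = torch.load(path)\n",
+    "    pitch_values.append(pitch[pitch != 0.0])  # drop unvoiced frames, as the script does\n",
+    "\n",
+    "pitch_values = torch.cat(pitch_values)\n",
+    "print(f'PITCH_MEAN={pitch_values.mean().item()}, PITCH_STD={pitch_values.std().item()}')\n",
+    "print(f'PITCH_MIN={pitch_values.min().item()}, PITCH_MAX={pitch_values.max().item()}')\n",
+    "```"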
] }, { "cell_type": "code", "execution_count": null, - "id": "e3344194", + "id": "6977a7b6", "metadata": {}, "outputs": [], "source": [ - "import librosa\n", - "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", - "from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir" + "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + " manifest_filepath={train_manifest} \\\n", + " sup_data_path={suppdir} \\\n", + " dataset.sample_rate={sample_rate} \\\n", + " dataset.n_fft=2048 \\\n", + " dataset.win_length=2048 \\\n", + " dataset.hop_length=512" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "899abf95", + "cell_type": "markdown", + "id": "f8b9c6fe", "metadata": {}, - "outputs": [], "source": [ - "def get_pitch(sample): \n", - " rel_audio_path = Path(sample[\"audio_filepath\"]).relative_to(base_data_dir).with_suffix(\"\")\n", - " rel_audio_path_as_text_id = str(rel_audio_path).replace(\"/\", \"_\")\n", - " pitch_filepath = os.path.join(pitch_dir, f\"{rel_audio_path_as_text_id}.pt\")\n", - " \n", - " if os.path.exists(pitch_filepath):\n", - " pitch = torch.load(pitch_filepath).numpy()\n", - "\n", - " else:\n", - " features = wave_model.process(\n", - " sample[\"audio_filepath\"]\n", - " )\n", - " voiced_tuple = librosa.pyin(\n", - " features.numpy(),\n", - " fmin=librosa.note_to_hz('C2'),\n", - " fmax=librosa.note_to_hz('C7'),\n", - " frame_length=2048,\n", - " sr=sample_rate,\n", - " fill_na=0.0,\n", - " )\n", - " pitch = voiced_tuple[0]\n", - " torch.save(torch.from_numpy(pitch).float(), pitch_filepath)\n", - " \n", - " return pitch" + "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", + "```bash\n", + "PITCH_MEAN=140.84278869628906, PITCH_STD=50.97673034667969\n", + "PITCH_MIN=65.4063949584961, PITCH_MAX=285.3046875\n", + "```" ] }, { "cell_type": "code", "execution_count": null, - "id": "c155d4b9", + "id": "a202a440", "metadata": {}, "outputs": [], "source": [ - "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", - "pitch_dir = os.path.join(suppdir, 'pitch')\n", - "os.makedirs(suppdir, exist_ok=True)\n", - "os.makedirs(pitch_dir, exist_ok=True)\n", - "\n", - "train_pitchs = []\n", - "train_datas = json_reader(train_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", - "for m in tqdm(train_datas): train_pitchs.append(get_pitch(m))\n", - " \n", - "valid_datas = json_reader(valid_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", - "for m in tqdm(valid_datas): get_pitch(m)\n", - "\n", - "train_pitchs = np.concatenate(train_pitchs)\n", - "pitch_mean = float(np.mean(train_pitchs))\n", - "pitch_std = float(np.std(train_pitchs))\n", - "\n", - "with open(os.path.join(manidir, 'pitch_stats.json'), 'w') as f:\n", - " json.dump({'pitch':[pitch_mean, pitch_std]}, f)" + "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + " manifest_filepath={valid_manifest} \\\n", + " sup_data_path={suppdir} \\\n", + " dataset.sample_rate={sample_rate} \\\n", + " dataset.n_fft=2048 \\\n", + " dataset.win_length=2048 \\\n", + " dataset.hop_length=512" ] }, { "cell_type": "markdown", - "id": "a41ee532", + "id": "9fc6ac07", "metadata": {}, "source": [ - "## c. 
Training" + "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", + "```bash\n", + "!cd {codedir} && python scripts/dataset_processing/tts/compute_speaker_stats.py \\\n", + " --manifest_path={train_manifest} \\\n", + " --sup_data_path={suppdir} \\\n", + " --pitch_stats_path={datadir}/pitch_stats.json\n", + "```" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7f954beb", + "cell_type": "markdown", + "id": "118e4cde", "metadata": {}, - "outputs": [], "source": [ - "!mkdir -p {confdir} && cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/fastpitch_align_44100_adapter.yaml" + "## c. Training" ] }, { "cell_type": "code", "execution_count": null, - "id": "e82e6860", + "id": "150af3d8", "metadata": {}, "outputs": [], "source": [ - "!mkdir -p {codedir} && cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch.py" + "phoneme_dict_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", + "heteronyms_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", + "\n", + "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", + "PITCH_MEAN=140.84278869628906\n", + "PITCH_STD=65.4063949584961" ] }, { "cell_type": "markdown", - "id": "12307264", + "id": "346635df", "metadata": {}, "source": [ "### Important notes\n", - "* [Data] **speaker_id** in **sup_data_types**: each data has an unique speaker index (start from 0) in the input.\n", - "* [Data] **reference_audio** in **sup_data_types**: each data has a reference audio (from the same speaker) in the input.\n", - "* [Speaker] **model.speaker_encoder.lookup_module**: model creates lookup table to get speaker embedding from speaker id.\n", - "* [Speaker] **model.speaker_encoder.lookup_module.n_speakers**: model gets the speaker size. \n", - "* [Speaker] **model.speaker_encoder.gst_module**: model creates global style token to extract speaker information from reference audio.\n", - "* [Condition] **condition_types=\"['add', 'layernorm']\"**: insert conditions with `add` operation to inputs and `layernorm` operation to layernorms." + "* `sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" `\n", + " * **speaker_id**: each data has an unique speaker index (start from 0) in the input.\n", + " * **reference_audio**: each data has a reference audio (from the same speaker) in the input.\n", + " \n", + "* `model.speaker_encoder.lookup_module.n_speakers`\n", + " * if use **model.speaker_encoder.lookup_module**, please give n_speakers to create the lookup table\n", + "\n", + "* `condition_types=\"['add', 'concat', layernorm']`\n", + " * use different operation type to condition module (e.g. 
input_fft/output_fft/duration_predictor/pitch_predictor/alignment_module)\n", + " * **add**: add conditions to module input\n", + " * **concat**: concat conditions to module input\n", + " * **layernorm**: scale and shift layernorm outputs based on conditions\n", + " \n", + "* Other default arguments in config:\n", + " * `model.speaker_encoder.lookup_module`: model creates lookup table to get speaker embedding from speaker id.\n", + " * `model.speaker_encoder.gst_module`: model creates global style token to extract speaker information from reference audio.\n", + "\n", + "* Other optional arguments based on your preference:\n", + " * batch_size\n", + " * max_duration\n", + " * min_duration\n", + " * exp_manager\n", + " * trainer" ] }, { "cell_type": "code", "execution_count": null, - "id": "6ffdce37", + "id": "66aac1bd", "metadata": {}, "outputs": [], "source": [ "# Normally 200 epochs\n", - "!(python {codedir}/fastpitch.py \\\n", - "--config-path={os.path.abspath(confdir)} \\\n", + "!(cd {codedir} && python examples/tts/fastpitch.py \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", "+init_from_pretrained_model=\"tts_en_fastpitch\" \\\n", - "sample_rate={sample_rate} \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", - "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id','reference_audio']\" \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", "sup_data_path={suppdir} \\\n", - "pitch_mean={pitch_mean} \\\n", - "pitch_std={pitch_std} \\\n", - "phoneme_dict_path={normdir}/cmudict-0.7b_nv22.10 \\\n", - "heteronyms_path={normdir}/heteronyms-052722 \\\n", - "model.speaker_encoder.lookup_module._target_=\"nemo.collections.tts.modules.submodules.SpeakerLookupTable\" \\\n", + "pitch_mean={PITCH_MEAN} \\\n", + "pitch_std={PITCH_STD} \\\n", + "phoneme_dict_path={phoneme_dict_path} \\\n", + "heteronyms_path={heteronyms_path} \\\n", "model.speaker_encoder.lookup_module.n_speakers=5 \\\n", - "model.speaker_encoder.gst_module._target_=\"nemo.collections.tts.modules.submodules.GlobalStyleToken\" \\\n", "model.input_fft.condition_types=\"['add', 'layernorm']\" \\\n", "model.output_fft.condition_types=\"['add', 'layernorm']\" \\\n", "model.duration_predictor.condition_types=\"['add', 'layernorm']\" \\\n", @@ -418,12 +419,9 @@ "model.alignment_module.condition_types=\"['add']\" \\\n", "model.train_ds.dataloader_params.batch_size=8 \\\n", "model.validation_ds.dataloader_params.batch_size=8 \\\n", - "model.train_ds.dataloader_params.num_workers=8 \\\n", - "model.validation_ds.dataloader_params.num_workers=8 \\\n", "model.train_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.min_duration=0.1 \\\n", - "+model.text_tokenizer.add_blank_at=True \\\n", "exp_manager.exp_dir={logsdir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", @@ -440,19 +438,21 @@ { "cell_type": "code", "execution_count": null, - "id": "73bc625f", + "id": "50512025", "metadata": {}, "outputs": [], "source": [ "# e.g. 
NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", - "YOUR_PRETRAINED_FASTPITCH_CHECKPOINT" + "pretrained_fastpitch_checkpoint_ckpt = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", + "print(pretrained_fastpitch_checkpoint_ckpt)\n", + "pretrained_fastpitch_checkpoint_nemo = os.path.abspath(list(last_checkpoint_dir.glob('*.nemo'))[0])\n", + "print(pretrained_fastpitch_checkpoint_nemo)" ] }, { "cell_type": "markdown", - "id": "711a9913", + "id": "7f1f3b3f", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -460,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "88364f18", + "id": "d89411fc", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -469,125 +469,43 @@ { "cell_type": "code", "execution_count": null, - "id": "ff07b4d2", + "id": "04a0e06e", "metadata": {}, "outputs": [], "source": [ - "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from collections import defaultdict\n", - "import random" + "!cd {codedir} \\\n", + "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", + "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", + "--input-json-manifest={train_manifest} \\\n", + "--input-sup-data-path={suppdir} \\\n", + "--output-folder={melsdir} \\\n", + "--device=\"cuda:0\" \\\n", + "--batch-size=1 \\\n", + "--num-workers=1 \\\n", + "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", + "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", + "--input-json-manifest={valid_manifest} \\\n", + "--input-sup-data-path={suppdir} \\\n", + "--output-folder={melsdir} \\\n", + "--device=\"cuda:0\" \\\n", + "--batch-size=1 \\\n", + "--num-workers=1" ] }, { "cell_type": "code", "execution_count": null, - "id": "68712a9d", + "id": "41aebdbe", "metadata": {}, "outputs": [], "source": [ - "def gen_spectrogram(index, manifest, speaker_to_index, base_data_dir):\n", - " \n", - " record = manifest[index]\n", - " audio_file = record[\"audio_filepath\"]\n", - " \n", - " if '.wav' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", - " \n", - " if '.flac' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", - " \n", - " if os.path.exists(save_path):\n", - " return save_path\n", - " \n", - " if \"normalized_text\" in record:\n", - " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", - " else:\n", - " text = spec_model.parse(record['text'])\n", - " \n", - " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", - " \n", - " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", - " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", - " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", - " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", - " \n", - " speaker = 
torch.tensor([record['speaker']]).to(spec_model.device)\n", - " \n", - " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", - " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", - " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", - " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", - " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", - " \n", - " with torch.no_grad():\n", - " spectrogram = spec_model.forward(\n", - " text=text, \n", - " input_lens=text_len,\n", - " spec=spect, \n", - " mel_lens=spect_len, \n", - " attn_prior=attn_prior,\n", - " speaker=speaker,\n", - " reference_spec=reference_spec,\n", - " reference_spec_lens=reference_spec_len\n", - " )[0]\n", - " \n", - " spec = spectrogram[0].to('cpu').numpy()\n", - " np.save(save_path, spec)\n", - " return save_path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4dbec65", - "metadata": {}, - "outputs": [], - "source": [ - "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()\n", - "beta_binomial_interpolator = BetaBinomialInterpolator()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b8809ee", - "metadata": {}, - "outputs": [], - "source": [ - "os.makedirs(melsdir, exist_ok=True)\n", - "\n", - "# Train\n", - "train_datas = json_reader(train_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in train_datas])\n", - "\n", - "speaker_to_index = defaultdict(list)\n", - "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", - "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", - "\n", - "for i, record in enumerate(tqdm(train_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index, base_data_dir)\n", - "\n", - "json_writer(train_datas, train_manifest)\n", - "\n", - "\n", - "# Valid\n", - "valid_datas = json_reader(valid_manifest)\n", - "base_data_dir = get_base_dir([item[\"audio_filepath\"] for item in valid_datas])\n", - "\n", - "speaker_to_index = defaultdict(list)\n", - "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", - "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", - "\n", - "for i, record in enumerate(tqdm(valid_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index, base_data_dir)\n", - "\n", - "json_writer(valid_datas, valid_manifest)" + "train_manifest_mel = f\"{melsdir}/train_mel.json\"\n", + "valid_manifest_mel = f\"{melsdir}/dev_mel.json\"" ] }, { "cell_type": "markdown", - "id": "d5740a15", + "id": "2a742d9f", "metadata": {}, "source": [ "## b. 
Training" @@ -596,63 +514,47 @@ { "cell_type": "code", "execution_count": null, - "id": "7af905c2", - "metadata": {}, - "outputs": [], - "source": [ - "!cd {confdir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan_44100.yaml\n", - "!cd {confdir} && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml \n", - "!cd {confdir} && mkdir -p model/validation_ds && cd model/validation_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml\n", - "!cd {confdir} && mkdir -p model/generator && cd model/generator && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/generator/v1_44100.yaml\n", - "!cd {codedir} && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/hifigan_finetune.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ab9fe5b", + "id": "bdd6e869", "metadata": {}, "outputs": [], "source": [ "# Normally 100 epochs\n", - "!(python {codedir}/hifigan_finetune.py \\\n", - "--config-path={os.path.abspath(confdir)} \\\n", + "!cd {codedir} && python examples/tts/hifigan_finetune.py \\\n", "--config-name=hifigan_44100.yaml \\\n", - "train_dataset={train_manifest} \\\n", - "validation_datasets={valid_manifest} \\\n", + "train_dataset={train_manifest_mel} \\\n", + "validation_datasets={valid_manifest_mel} \\\n", "+init_from_pretrained_model=\"tts_en_hifitts_hifigan_ft_fastpitch\" \\\n", "model.train_ds.dataloader_params.batch_size=32 \\\n", "model.optim.lr=0.0001 \\\n", - "+trainer.max_epochs=5 \\\n", - "trainer.check_val_every_n_epoch=5 \\\n", "model/train_ds=train_ds_finetune \\\n", "model/validation_ds=val_ds_finetune \\\n", + "+trainer.max_epochs=5 \\\n", + "trainer.check_val_every_n_epoch=5 \\\n", "trainer.devices=-1 \\\n", "trainer.strategy='ddp' \\\n", "trainer.precision=16 \\\n", "exp_manager.exp_dir={logsdir} \\\n", "exp_manager.create_wandb_logger=True \\\n", "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", - "exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", - ")" + "exp_manager.wandb_logger_kwargs.project=\"NeMo\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "2d427f5b", + "id": "1a5e757a", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX-last.ckpt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", - "YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT" + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", + "finetuned_hifigan_on_multispeaker_checkpoint" ] }, { "cell_type": "markdown", - "id": "90ec6e21", + "id": "312937af", "metadata": {}, "source": [ "# 3. 
Inference" @@ -661,18 +563,21 @@ { "cell_type": "code", "execution_count": null, - "id": "a522779a", + "id": "99eed4f7", "metadata": {}, "outputs": [], "source": [ + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.collections.tts.models import FastPitchModel\n", "from nemo.collections.tts.models import HifiGanModel\n", + "from collections import defaultdict\n", "import IPython.display as ipd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", - "id": "64490c8a", + "id": "1ec6e63c", "metadata": {}, "source": [ "## a. Load Model" @@ -681,7 +586,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51d78337", + "id": "57b38a34", "metadata": {}, "outputs": [], "source": [ @@ -691,28 +596,28 @@ { "cell_type": "code", "execution_count": null, - "id": "b054e4a5", + "id": "6e8844e2", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", - "spec_model = FastPitchModel.load_from_checkpoint(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT).eval().cuda()" + "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint_ckpt).eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "00a8a5af", + "id": "56a57e20", "metadata": {}, "outputs": [], "source": [ "# HiFiGAN\n", - "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT).eval().cuda()" + "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=finetuned_hifigan_on_multispeaker_checkpoint).eval().cuda()" ] }, { "cell_type": "markdown", - "id": "b9c726e4", + "id": "c58b1f04", "metadata": {}, "source": [ "## b. Output Audio" @@ -721,7 +626,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d2974168", + "id": "0f0c4566", "metadata": {}, "outputs": [], "source": [ @@ -756,7 +661,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96fcb433", + "id": "85dffa87", "metadata": {}, "outputs": [], "source": [ @@ -782,7 +687,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ae8f4e3", + "id": "96e162e7", "metadata": {}, "outputs": [], "source": [ @@ -817,27 +722,18 @@ { "cell_type": "code", "execution_count": null, - "id": "884e6906", - "metadata": {}, - "outputs": [], - "source": [ - "str(YOUR_PRETRAINED_FASTPITCH_CHECKPOINT)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d17f232", + "id": "cb869af8", "metadata": {}, "outputs": [], "source": [ - "str(YOUR_FINETUNED_HIFIGAN_ON_MULTISPEAKER_CHECKPOINT)" + "print(f\"FastPitch checkpoint: {pretrained_fastpitch_checkpoint_ckpt}\")\n", + "print(f\"HiFi-Gan checkpoint: {finetuned_hifigan_on_multispeaker_checkpoint}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "7f49ee8a", + "id": "83da2e62", "metadata": {}, "outputs": [], "source": [] @@ -859,7 +755,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, From 2ad34f8d9da82f536f4e4969a9be05515a620b26 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:14:08 +0000 Subject: [PATCH 07/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/dataset_processing/tts/resynthesize_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dataset_processing/tts/resynthesize_dataset.py b/scripts/dataset_processing/tts/resynthesize_dataset.py index 
996868625147..cebccc882b27 100644 --- a/scripts/dataset_processing/tts/resynthesize_dataset.py +++ b/scripts/dataset_processing/tts/resynthesize_dataset.py @@ -118,15 +118,15 @@ def resynthesize_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: batch = to_device_recursive(batch, self.device) mels, mel_lens = self.model.preprocessor(input_signal=batch["audio"], length=batch["audio_lens"]) - + reference_audio = batch.get("reference_audio", None) reference_audio_len = batch.get("reference_audio_lens", None) reference_spec, reference_spec_len = None, None if reference_audio is not None: reference_spec, reference_spec_len = self.model.preprocessor( input_signal=reference_audio, length=reference_audio_len - ) - + ) + outputs_tuple = self.model.forward( text=batch["text"], durs=None, From 49f0f02e6d4dd36fc117a3a1c6b685bf5dbae088 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 24 Apr 2023 12:14:38 -0700 Subject: [PATCH 08/25] Follow comments Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 110 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 98 ++++++++-------- 2 files changed, 104 insertions(+), 104 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index e47f1ae94228..84f6e32c06b3 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "82f4dfee", + "id": "071b4407", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "fabf7666", + "id": "452d1fc2", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d6941063", + "id": "2d0dcd2b", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53792663", + "id": "167afb53", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb5ae13a", + "id": "a21f51a8", "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21528f24", + "id": "fe206e0e", "metadata": {}, "outputs": [], "source": [ @@ -109,19 +109,19 @@ { "cell_type": "code", "execution_count": null, - "id": "ac1f2e95", + "id": "87094d41", "metadata": {}, "outputs": [], "source": [ "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", "# comment out the below lines and set `codedir` to your local path.\n", - "# !git clone https://github.com/NVIDIA/NeMo.git {codedir}" + "!git clone https://github.com/NVIDIA/NeMo.git {codedir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "26be2e0d", + "id": "1af9c39f", "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5018dffc", + "id": "f400fe2e", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "386399a5", + "id": "2b4f491f", "metadata": {}, "source": [ "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c8f67823", + "id": "7cc52b7d", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dfbef08c", + "id": "8374c9b4", "metadata": {}, "outputs": [], "source": [ @@ -209,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea30c2cb", + "id": "c4309bd1", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2729d0a8", + "id": "6a88cdbb", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +233,7 @@ }, { "cell_type": "markdown", - "id": "f5c09c46", + "id": "073f5b85", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "1ec1439e", + "id": "38ac5dcf", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -251,7 +251,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bfa82a6e", + "id": "7a091d3f", "metadata": {}, "outputs": [], "source": [ @@ -261,7 +261,7 @@ { "cell_type": "code", "execution_count": null, - "id": "567bf4a4", + "id": "894880cd", "metadata": {}, "outputs": [], "source": [ @@ -272,7 +272,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ce43372", + "id": "48a6f03e", "metadata": {}, "outputs": [], "source": [ @@ -282,7 +282,7 @@ }, { "cell_type": "markdown", - "id": "6fdc5d99", + "id": "32686b67", "metadata": {}, "source": [ "## b. Preprocessing" @@ -290,7 +290,7 @@ }, { "cell_type": "markdown", - "id": "4693c1ed", + "id": "6fb24a9c", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -300,7 +300,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1ce9c0c", + "id": "b9826db8", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a927db03", + "id": "f27c8fec", "metadata": {}, "outputs": [], "source": [ @@ -330,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d4fbf55", + "id": "a9c18449", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "540ea432", + "id": "db8f20b4", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -354,7 +354,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe0a1295", + "id": "df77c416", "metadata": {}, "outputs": [], "source": [ @@ -369,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "2fe3edd3", + "id": "bafbf280", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -382,7 +382,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a816527f", + "id": "c9a73569", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ }, { "cell_type": "markdown", - "id": "4f90d6a1", + "id": "caa2da01", "metadata": {}, "source": [ "## c. 
Training" @@ -406,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eaa29988", + "id": "3d6d51a1", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ }, { "cell_type": "markdown", - "id": "4bc250e8", + "id": "eecad436", "metadata": {}, "source": [ "### Important notes\n", @@ -435,7 +435,7 @@ { "cell_type": "code", "execution_count": null, - "id": "062417ea", + "id": "78680a01", "metadata": {}, "outputs": [], "source": [ @@ -472,7 +472,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e44dacbf", + "id": "ab5cba4f", "metadata": {}, "outputs": [], "source": [ @@ -484,7 +484,7 @@ }, { "cell_type": "markdown", - "id": "c0fee754", + "id": "ebaedb04", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -492,7 +492,7 @@ }, { "cell_type": "markdown", - "id": "787c6390", + "id": "60b81e1b", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -502,7 +502,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c831eb89", + "id": "16a38e1c", "metadata": {}, "outputs": [], "source": [ @@ -516,7 +516,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f44c889e", + "id": "75692e29", "metadata": {}, "outputs": [], "source": [ @@ -574,7 +574,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07dda4fd", + "id": "cdfca5a3", "metadata": {}, "outputs": [], "source": [ @@ -593,7 +593,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b5f69d7", + "id": "b49d1680", "metadata": {}, "outputs": [], "source": [ @@ -625,7 +625,7 @@ }, { "cell_type": "markdown", - "id": "bcc7f2a5", + "id": "2d9416b7", "metadata": {}, "source": [ "## b. Training" @@ -634,7 +634,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c5cd42b", + "id": "dfc6a221", "metadata": {}, "outputs": [], "source": [ @@ -662,7 +662,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e7a838e", + "id": "d81bed4f", "metadata": {}, "outputs": [], "source": [ @@ -674,7 +674,7 @@ }, { "cell_type": "markdown", - "id": "86cff00b", + "id": "2d7dbbbd", "metadata": {}, "source": [ "# 3. Inference" @@ -683,7 +683,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bd0b5d12", + "id": "5065b0d2", "metadata": {}, "outputs": [], "source": [ @@ -694,7 +694,7 @@ }, { "cell_type": "markdown", - "id": "0b3f773d", + "id": "1d734b4f", "metadata": {}, "source": [ "## a. Load Model" @@ -703,7 +703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f9ae03a", + "id": "47c5983a", "metadata": {}, "outputs": [], "source": [ @@ -713,7 +713,7 @@ { "cell_type": "code", "execution_count": null, - "id": "693c4576", + "id": "d0a71553", "metadata": {}, "outputs": [], "source": [ @@ -728,7 +728,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64ce0e61", + "id": "8ccaf4e2", "metadata": {}, "outputs": [], "source": [ @@ -738,7 +738,7 @@ }, { "cell_type": "markdown", - "id": "d1dae076", + "id": "eebc7522", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -747,7 +747,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f394699", + "id": "035cacf7", "metadata": {}, "outputs": [], "source": [ @@ -780,7 +780,7 @@ { "cell_type": "code", "execution_count": null, - "id": "753b2811", + "id": "5a187093", "metadata": {}, "outputs": [], "source": [ @@ -803,7 +803,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27b8b66d", + "id": "a5a1c936", "metadata": {}, "outputs": [], "source": [ @@ -835,7 +835,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36568855", + "id": "4db3d954", "metadata": {}, "outputs": [], "source": [ @@ -847,7 +847,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9434828c", + "id": "c81d1352", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index cf804ba8c8ad..148f47c8390b 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a29bfccd", + "id": "e6f6329c", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "d14ad044", + "id": "2fb2f0b8", "metadata": {}, "source": [ "# License\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9616932f", + "id": "3fa40fa7", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67406c13", + "id": "8cc9f3a2", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fba188b2", + "id": "3b674cfb", "metadata": {}, "outputs": [], "source": [ @@ -96,19 +96,19 @@ { "cell_type": "code", "execution_count": null, - "id": "9c3931cf", + "id": "aa143477", "metadata": {}, "outputs": [], "source": [ "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", "# comment out the below lines and set `codedir` to your local path.\n", - "# !git clone https://github.com/NVIDIA/NeMo.git {codedir}" + "!git clone https://github.com/NVIDIA/NeMo.git {codedir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "141b7c04", + "id": "f15c1fe6", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8df0f121", + "id": "68e5a3ba", "metadata": {}, "outputs": [], "source": [ @@ -143,7 +143,7 @@ }, { "cell_type": "markdown", - "id": "2724dc08", + "id": "ab41f977", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "bc7a68d8", + "id": "2a84e0b6", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0630588c", + "id": "a6323888", "metadata": {}, "outputs": [], "source": [ @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6647d9f6", + "id": "910254f0", "metadata": {}, "outputs": [], "source": [ @@ -183,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "092c43e1", + "id": "fd8ed27a", "metadata": {}, "outputs": [], "source": [ @@ -193,7 +193,7 @@ }, { "cell_type": "markdown", - "id": "9fecd29b", + "id": "1347b6b0", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -201,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "c0cedc10", + "id": "03fc2895", "metadata": {}, "source": [ "### Add absoluate audio path in manifest\n", @@ -211,7 +211,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c66b0e3", + "id": "8f4b0267", "metadata": {}, "outputs": [], "source": [ @@ -229,7 +229,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3358531c", + "id": "bd860349", "metadata": {}, "outputs": [], "source": [ @@ -244,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "43b2ddb9", + "id": "02208d95", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0\n", @@ -254,7 +254,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e2484099", + "id": "2f28d82c", "metadata": {}, "outputs": [], "source": [ @@ -270,7 +270,7 @@ }, { "cell_type": "markdown", - "id": "9da488d8", + "id": "edab7373", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -283,7 +283,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6977a7b6", + "id": "2c470e6c", "metadata": {}, "outputs": [], "source": [ @@ -298,7 +298,7 @@ }, { "cell_type": "markdown", - "id": "f8b9c6fe", + "id": "ad21df85", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -311,7 +311,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a202a440", + "id": "d1ed99d1", "metadata": {}, "outputs": [], "source": [ @@ -326,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "9fc6ac07", + "id": "eca9b498", "metadata": {}, "source": [ "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", @@ -340,7 +340,7 @@ }, { "cell_type": "markdown", - "id": "118e4cde", + "id": "9d786e75", "metadata": {}, "source": [ "## c. Training" @@ -349,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "150af3d8", + "id": "d09fd354", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ }, { "cell_type": "markdown", - "id": "346635df", + "id": "f1affd0e", "metadata": {}, "source": [ "### Important notes\n", @@ -395,7 +395,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66aac1bd", + "id": "035a989d", "metadata": {}, "outputs": [], "source": [ @@ -438,7 +438,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50512025", + "id": "ebc15094", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "7f1f3b3f", + "id": "1f0f6353", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -460,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "d89411fc", + "id": "12442284", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -469,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04a0e06e", + "id": "d7499ef4", "metadata": {}, "outputs": [], "source": [ @@ -495,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41aebdbe", + "id": "e4b91945", "metadata": {}, "outputs": [], "source": [ @@ -505,7 +505,7 @@ }, { "cell_type": "markdown", - "id": "2a742d9f", + "id": "995dee11", "metadata": {}, "source": [ "## b. 
Training" @@ -514,7 +514,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bdd6e869", + "id": "525a4b58", "metadata": {}, "outputs": [], "source": [ @@ -542,7 +542,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a5e757a", + "id": "f7e2d6ff", "metadata": {}, "outputs": [], "source": [ @@ -554,7 +554,7 @@ }, { "cell_type": "markdown", - "id": "312937af", + "id": "91724158", "metadata": {}, "source": [ "# 3. Inference" @@ -563,7 +563,7 @@ { "cell_type": "code", "execution_count": null, - "id": "99eed4f7", + "id": "bbd21bcf", "metadata": {}, "outputs": [], "source": [ @@ -577,7 +577,7 @@ }, { "cell_type": "markdown", - "id": "1ec6e63c", + "id": "2dd18808", "metadata": {}, "source": [ "## a. Load Model" @@ -586,7 +586,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57b38a34", + "id": "32b934e3", "metadata": {}, "outputs": [], "source": [ @@ -596,7 +596,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e8844e2", + "id": "4bf4c97a", "metadata": {}, "outputs": [], "source": [ @@ -607,7 +607,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56a57e20", + "id": "9cbab303", "metadata": {}, "outputs": [], "source": [ @@ -617,7 +617,7 @@ }, { "cell_type": "markdown", - "id": "c58b1f04", + "id": "1b88d0d7", "metadata": {}, "source": [ "## b. Output Audio" @@ -626,7 +626,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f0c4566", + "id": "1a2c47fb", "metadata": {}, "outputs": [], "source": [ @@ -661,7 +661,7 @@ { "cell_type": "code", "execution_count": null, - "id": "85dffa87", + "id": "afdca860", "metadata": {}, "outputs": [], "source": [ @@ -687,7 +687,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96e162e7", + "id": "b45ba3af", "metadata": {}, "outputs": [], "source": [ @@ -722,7 +722,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb869af8", + "id": "6aa522c5", "metadata": {}, "outputs": [], "source": [ @@ -733,7 +733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83da2e62", + "id": "d676763a", "metadata": {}, "outputs": [], "source": [] From 0631e5ee668fff34dfd868f46b7ed701922b9514 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 25 Apr 2023 17:48:04 -0700 Subject: [PATCH 09/25] Fix load .nemo error Signed-off-by: hsiehjackson --- nemo/collections/tts/models/fastpitch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 5502e69a3111..39d2beac0c3f 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -182,6 +182,9 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): } if self.fastpitch.speaker_emb is not None: self.export_config["num_speakers"] = cfg.n_speakers + + # Adapter modules setup (from FastPitchAdapterModelMixin) + self.setup_adapters() def _get_default_text_tokenizer_conf(self): text_tokenizer: TextTokenizerConfig = TextTokenizerConfig() From d3e20d937a26ff55e7d93631bb220b2b75988eb5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Apr 2023 00:49:10 +0000 Subject: [PATCH 10/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/tts/models/fastpitch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 39d2beac0c3f..28185c8f8622 100644 --- 
a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -182,7 +182,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): } if self.fastpitch.speaker_emb is not None: self.export_config["num_speakers"] = cfg.n_speakers - + # Adapter modules setup (from FastPitchAdapterModelMixin) self.setup_adapters() From ea85082d1ea4fa24ad732843b44e3dd3330c717e Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 25 Apr 2023 18:26:41 -0700 Subject: [PATCH 11/25] Support multi-speaker fine-tune Signed-off-by: hsiehjackson --- examples/tts/fastpitch_finetune_adapters.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py index 396552b0f4fd..e3b5dd85561b 100644 --- a/examples/tts/fastpitch_finetune_adapters.py +++ b/examples/tts/fastpitch_finetune_adapters.py @@ -102,7 +102,12 @@ def main(cfg): # Freeze model model.freeze() - + + # Used if we fine-tune with multi-speaker dataset + if model.fastpitch.speaker_encoder is not None and model.fastpitch.speaker_encoder.lookup_module is not None: + for name, param in model.fastpitch.speaker_encoder.lookup_module.named_parameters(): + param.requires_grad = True + # Setup adapters if adapter_global_cfg is not None: add_global_adapter_cfg(model, adapter_global_cfg) From 7f7fa26b56d9f28ddd3b0f262e05591a8cf00128 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Apr 2023 01:27:55 +0000 Subject: [PATCH 12/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tts/fastpitch_finetune_adapters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py index e3b5dd85561b..b063a482f018 100644 --- a/examples/tts/fastpitch_finetune_adapters.py +++ b/examples/tts/fastpitch_finetune_adapters.py @@ -102,12 +102,12 @@ def main(cfg): # Freeze model model.freeze() - + # Used if we fine-tune with multi-speaker dataset if model.fastpitch.speaker_encoder is not None and model.fastpitch.speaker_encoder.lookup_module is not None: for name, param in model.fastpitch.speaker_encoder.lookup_module.named_parameters(): param.requires_grad = True - + # Setup adapters if adapter_global_cfg is not None: add_global_adapter_cfg(model, adapter_global_cfg) From 6dc0ff4b160c523ab106b5e2c80694cf52310b44 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Wed, 26 Apr 2023 20:19:43 -0700 Subject: [PATCH 13/25] Follow comments Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 256 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 221 +++++++-------- 2 files changed, 224 insertions(+), 253 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 84f6e32c06b3..5a02eed23fdf 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "071b4407", + "id": "1e065b5e", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "452d1fc2", + "id": "89eb3a1f", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d0dcd2b", + "id": "95ccd2a1", "metadata": {}, "outputs": 
[], "source": [ @@ -58,17 +58,22 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "# BRANCH = 'main'\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "\n", + "# # Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", + "# # comment out the below lines and set `code_dir` to your local path.\n", + "code_dir = 'NeMoTTS' \n", + "!git clone https://github.com/NVIDIA/NeMo.git {code_dir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "167afb53", + "id": "96c49a4c", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a21f51a8", + "id": "6037a349", "metadata": {}, "outputs": [], "source": [ @@ -89,39 +94,25 @@ { "cell_type": "code", "execution_count": null, - "id": "fe206e0e", + "id": "131451f0", "metadata": {}, "outputs": [], "source": [ "sample_rate = 44100\n", - "# Store all python script\n", - "codedir = 'NeMoTTS' \n", "# Store all manifest and audios\n", - "datadir = 'NeMoTTS_dataset'\n", + "data_dir = 'NeMoTTS_dataset'\n", "# Store all supplementary files\n", - "suppdir = \"NeMoTTS_sup_data\"\n", + "supp_dir = \"NeMoTTS_sup_data\"\n", "# Store all training logs\n", - "logsdir = \"NeMoTTS_logs\"\n", + "logs_dir = \"NeMoTTS_logs\"\n", "# Store all mel-spectrograms for vocoder training\n", - "melsdir = \"NeMoTTS_mels\"" + "mels_dir = \"NeMoTTS_mels\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "87094d41", - "metadata": {}, - "outputs": [], - "source": [ - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set `codedir` to your local path.\n", - "!git clone https://github.com/NVIDIA/NeMo.git {codedir}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1af9c39f", + "id": "8d033777", "metadata": {}, "outputs": [], "source": [ @@ -139,25 +130,25 @@ { "cell_type": "code", "execution_count": null, - "id": "f400fe2e", + "id": "1c71085a", "metadata": {}, "outputs": [], "source": [ - "os.makedirs(codedir, exist_ok=True)\n", - "codedir = os.path.abspath(codedir)\n", - "os.makedirs(datadir, exist_ok=True)\n", - "datadir = os.path.abspath(datadir)\n", - "os.makedirs(suppdir, exist_ok=True)\n", - "suppdir = os.path.abspath(suppdir)\n", - "os.makedirs(logsdir, exist_ok=True)\n", - "logsdir = os.path.abspath(logsdir)\n", - "os.makedirs(melsdir, exist_ok=True)\n", - "melsdir = os.path.abspath(melsdir)" + "os.makedirs(code_dir, exist_ok=True)\n", + "code_dir = os.path.abspath(code_dir)\n", + "os.makedirs(data_dir, exist_ok=True)\n", + "data_dir = os.path.abspath(data_dir)\n", + "os.makedirs(supp_dir, exist_ok=True)\n", + "supp_dir = os.path.abspath(supp_dir)\n", + "os.makedirs(logs_dir, exist_ok=True)\n", + "logs_dir = os.path.abspath(logs_dir)\n", + "os.makedirs(mels_dir, exist_ok=True)\n", + "mels_dir = os.path.abspath(mels_dir)" ] }, { "cell_type": "markdown", - "id": "2b4f491f", + "id": "e52a7746", "metadata": {}, "source": [ "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -166,7 +157,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7cc52b7d", + "id": "f318ea3a", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8374c9b4", + "id": "b4c9f660", "metadata": {}, "outputs": [], "source": [ @@ -209,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4309bd1", + "id": "59e5d4e5", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a88cdbb", + "id": "45ba24e2", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +224,7 @@ }, { "cell_type": "markdown", - "id": "073f5b85", + "id": "f7be0c03", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -241,7 +232,7 @@ }, { "cell_type": "markdown", - "id": "38ac5dcf", + "id": "e3d6e3e3", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -251,28 +242,28 @@ { "cell_type": "code", "execution_count": null, - "id": "7a091d3f", + "id": "6a92c174", "metadata": {}, "outputs": [], "source": [ - "!cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" + "!cd {data_dir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset.tar.gz && tar zxf vctk_subset.tar.gz" ] }, { "cell_type": "code", "execution_count": null, - "id": "894880cd", + "id": "119970aa", "metadata": {}, "outputs": [], "source": [ - "manidir = f\"{datadir}/vctk_subset\"\n", + "manidir = f\"{data_dir}/vctk_subset\"\n", "!ls {manidir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "48a6f03e", + "id": "6d46a308", "metadata": {}, "outputs": [], "source": [ @@ -282,7 +273,7 @@ }, { "cell_type": "markdown", - "id": "32686b67", + "id": "8540d872", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -290,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "6fb24a9c", + "id": "ea07ff33", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -300,48 +291,32 @@ { "cell_type": "code", "execution_count": null, - "id": "b9826db8", + "id": "98bb7ab6", "metadata": {}, "outputs": [], "source": [ - "def json_reader(filename):\n", - " lines = []\n", - " with open(filename) as f:\n", - " for line in f: lines.append(json.loads(line))\n", - " return lines\n", - "\n", - "def json_writer(manifest, filename):\n", - " with open(filename, 'w') as fout:\n", - " for m in manifest: fout.write(json.dumps(m) + '\\n') " + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest" ] }, { "cell_type": "code", "execution_count": null, - "id": "f27c8fec", + "id": "cf7512c9", "metadata": {}, "outputs": [], "source": [ - "train_datas = json_reader(train_manifest)\n", + "train_datas = read_manifest(train_manifest)\n", "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "json_writer(train_datas, train_manifest)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9c18449", - "metadata": {}, - "outputs": [], - "source": [ - "valid_datas = json_reader(valid_manifest)\n", + "write_manifest(train_manifest, train_datas)\n", + "\n", + "valid_datas = read_manifest(valid_manifest)\n", "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "json_writer(valid_datas, valid_manifest)" + "write_manifest(valid_manifest, valid_datas)" ] }, { "cell_type": "markdown", - "id": "db8f20b4", + "id": "50d529ea", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -354,13 +329,13 @@ { "cell_type": "code", "execution_count": null, - "id": "df77c416", + "id": "ca883965", "metadata": {}, "outputs": [], "source": [ - "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + "!cd {code_dir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", " manifest_filepath={train_manifest} \\\n", - " sup_data_path={suppdir} \\\n", + " sup_data_path={supp_dir} \\\n", " dataset.sample_rate={sample_rate} \\\n", " dataset.n_fft=2048 \\\n", " dataset.win_length=2048 \\\n", @@ -369,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "bafbf280", + "id": "dd5671bd", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -382,13 +357,13 @@ { "cell_type": "code", "execution_count": null, - "id": "c9a73569", + "id": "820b6fa2", "metadata": {}, "outputs": [], "source": [ - "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + "!cd {code_dir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", " manifest_filepath={valid_manifest} \\\n", - " sup_data_path={suppdir} \\\n", + " sup_data_path={supp_dir} \\\n", " dataset.sample_rate={sample_rate} \\\n", " dataset.n_fft=2048 \\\n", " dataset.win_length=2048 \\\n", @@ -397,7 +372,7 @@ }, { "cell_type": "markdown", - "id": "caa2da01", + "id": "aac60eb0", "metadata": {}, "source": [ "## c. 
Training" @@ -406,12 +381,12 @@ { "cell_type": "code", "execution_count": null, - "id": "3d6d51a1", + "id": "19c47142", "metadata": {}, "outputs": [], "source": [ - "phoneme_dict_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", - "heteronyms_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", + "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", + "heteronyms_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", "\n", "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", "PITCH_MEAN=175.48513793945312\n", @@ -420,7 +395,7 @@ }, { "cell_type": "markdown", - "id": "eecad436", + "id": "6bfc7c70", "metadata": {}, "source": [ "### Important notes\n", @@ -435,18 +410,18 @@ { "cell_type": "code", "execution_count": null, - "id": "78680a01", + "id": "7e5ee24e", "metadata": {}, "outputs": [], "source": [ - "# Normally 100 epochs (15 mins)\n", - "!cd {codedir} && python examples/tts/fastpitch_finetune_adapters.py \\\n", + "# Normally 100 epochs\n", + "!cd {code_dir} && python examples/tts/fastpitch_finetune_adapters.py \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", "+init_from_ptl_ckpt={pretrained_fastpitch_checkpoint} \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", - "sup_data_path={suppdir} \\\n", + "sup_data_path={supp_dir} \\\n", "pitch_mean={PITCH_MEAN} \\\n", "pitch_std={PITCH_STD} \\\n", "~model.speaker_encoder.lookup_module \\\n", @@ -454,9 +429,8 @@ "model.validation_ds.dataloader_params.batch_size=8 \\\n", "model.optim.name=adam \\\n", "model.optim.lr=2e-4 \\\n", - "model.optim.weight_decay=0.0 \\\n", "~model.optim.sched \\\n", - "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.exp_dir={logs_dir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-finetune-adaptation\" \\\n", "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", @@ -472,19 +446,19 @@ { "cell_type": "code", "execution_count": null, - "id": "ab5cba4f", + "id": "8a298487", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", - "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", "finetuned_adapter_checkpoint = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", "print(finetuned_adapter_checkpoint)" ] }, { "cell_type": "markdown", - "id": "ebaedb04", + "id": "1f9cebb4", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -492,7 +466,7 @@ }, { "cell_type": "markdown", - "id": "60b81e1b", + "id": "bb1a64bb", "metadata": {}, "source": [ "## a. 
Dataset Preparation\n", @@ -502,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16a38e1c", + "id": "d443cbf5", "metadata": {}, "outputs": [], "source": [ @@ -510,13 +484,14 @@ "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", "from nemo.collections.tts.models import FastPitchModel\n", "from collections import defaultdict\n", - "import random" + "import random\n", + "random.seed(100)" ] }, { "cell_type": "code", "execution_count": null, - "id": "75692e29", + "id": "fc742c61", "metadata": {}, "outputs": [], "source": [ @@ -526,10 +501,10 @@ " audio_file = record[\"audio_filepath\"]\n", " \n", " if '.wav' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", + " save_path = os.path.abspath(os.path.join(mels_dir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", " \n", " if '.flac' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(melsdir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", + " save_path = os.path.abspath(os.path.join(mels_dir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", " \n", " if os.path.exists(save_path):\n", " return save_path\n", @@ -544,7 +519,6 @@ " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", - " print(spect.shape, spect_len)\n", " \n", " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", " \n", @@ -574,7 +548,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cdfca5a3", + "id": "d1ebcc29", "metadata": {}, "outputs": [], "source": [ @@ -593,14 +567,14 @@ { "cell_type": "code", "execution_count": null, - "id": "b49d1680", + "id": "4db5892a", "metadata": {}, "outputs": [], "source": [ - "os.makedirs(melsdir, exist_ok=True)\n", + "os.makedirs(mels_dir, exist_ok=True)\n", "\n", "# Train\n", - "train_datas = json_reader(train_manifest)\n", + "train_datas = read_manifest(train_manifest)\n", "speaker_to_index = defaultdict(list)\n", "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", @@ -608,11 +582,11 @@ "for i, record in enumerate(tqdm(train_datas)):\n", " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index)\n", "\n", - "json_writer(train_datas, train_manifest)\n", + "write_manifest(train_manifest, train_datas)\n", "\n", "\n", "# Valid\n", - "valid_datas = json_reader(valid_manifest)\n", + "valid_datas = read_manifest(valid_manifest)\n", "speaker_to_index = defaultdict(list)\n", "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", @@ -620,12 +594,12 @@ "for i, record in enumerate(tqdm(valid_datas)):\n", " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index)\n", "\n", - "json_writer(valid_datas, valid_manifest)" + "write_manifest(valid_manifest, valid_datas)" ] }, { "cell_type": "markdown", - "id": "2d9416b7", + "id": "daea80ef", "metadata": {}, "source": [ "## b. 
Training" @@ -634,12 +608,12 @@ { "cell_type": "code", "execution_count": null, - "id": "dfc6a221", + "id": "9e101e6b", "metadata": {}, "outputs": [], "source": [ - "# Normally 500 epochs (30 mins)\n", - "!cd {codedir} && python examples/tts/hifigan_finetune.py \\\n", + "# Normally 500 epochs\n", + "!cd {code_dir} && python examples/tts/hifigan_finetune.py \\\n", "--config-name=hifigan_44100.yaml \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", @@ -653,7 +627,7 @@ "trainer.devices=-1 \\\n", "trainer.strategy='ddp' \\\n", "trainer.precision=16 \\\n", - "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.exp_dir={logs_dir} \\\n", "exp_manager.create_wandb_logger=True \\\n", "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", "exp_manager.wandb_logger_kwargs.project=\"NeMo\"" @@ -662,19 +636,19 @@ { "cell_type": "code", "execution_count": null, - "id": "d81bed4f", + "id": "0bb975e0", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX.ckpt\n", - "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", "finetuned_hifigan_on_adaptation_checkpoint = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", "finetuned_hifigan_on_adaptation_checkpoint" ] }, { "cell_type": "markdown", - "id": "2d7dbbbd", + "id": "9e12f211", "metadata": {}, "source": [ "# 3. Inference" @@ -683,7 +657,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5065b0d2", + "id": "86c35868", "metadata": {}, "outputs": [], "source": [ @@ -694,7 +668,7 @@ }, { "cell_type": "markdown", - "id": "1d734b4f", + "id": "626525b0", "metadata": {}, "source": [ "## a. Load Model" @@ -703,7 +677,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47c5983a", + "id": "805b4909", "metadata": {}, "outputs": [], "source": [ @@ -713,22 +687,20 @@ { "cell_type": "code", "execution_count": null, - "id": "d0a71553", + "id": "17352083", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint)\n", "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", - "# spec_model.freeze()\n", - "# spec_model.unfreeze_enabled_adapters()\n", "spec_model = spec_model.eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ccaf4e2", + "id": "d6c36a10", "metadata": {}, "outputs": [], "source": [ @@ -738,7 +710,7 @@ }, { "cell_type": "markdown", - "id": "eebc7522", + "id": "8a71d4d7", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -747,7 +719,7 @@ { "cell_type": "code", "execution_count": null, - "id": "035cacf7", + "id": "eea31431", "metadata": {}, "outputs": [], "source": [ @@ -780,7 +752,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a187093", + "id": "fb0437a3", "metadata": {}, "outputs": [], "source": [ @@ -803,7 +775,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5a1c936", + "id": "e6ddd463", "metadata": {}, "outputs": [], "source": [ @@ -835,7 +807,20 @@ { "cell_type": "code", "execution_count": null, - "id": "4db3d954", + "id": "e3342281", + "metadata": {}, + "outputs": [], + "source": [ + "fintuned_fastpitch = 'fastpitch.nemo'\n", + "fintuned_hifigan = 'hifigan.nemo'\n", + "spec_model.save_to(fintuned_fastpitch)\n", + "vocoder_model.save_to(fintuned_hifigan)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59bd98b4", "metadata": {}, "outputs": [], "source": [ @@ -847,10 +832,13 @@ { "cell_type": "code", "execution_count": null, - "id": "c81d1352", + "id": "6a65f215", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "print(f\"FastPitch nemo file: {fintuned_fastpitch}\")\n", + "print(f\"HiFi-Gan nemo file: {fintuned_hifigan}\")" + ] } ], "metadata": { diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 148f47c8390b..1c8302400b42 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e6f6329c", + "id": "8318580f", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "2fb2f0b8", + "id": "45f9b2a8", "metadata": {}, "source": [ "# License\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3fa40fa7", + "id": "2a06cd68", "metadata": {}, "outputs": [], "source": [ @@ -56,17 +56,22 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "# BRANCH = 'main'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "\n", + "# # Download local version of NeMo scripts. 
If you are running locally and want to use your own local NeMo code,\n", + "# # comment out the below lines and set `code_dir` to your local path.\n", + "code_dir = 'NeMoTTS' \n", + "!git clone https://github.com/NVIDIA/NeMo.git {code_dir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "8cc9f3a2", + "id": "559da21f", "metadata": {}, "outputs": [], "source": [ @@ -76,39 +81,25 @@ { "cell_type": "code", "execution_count": null, - "id": "3b674cfb", + "id": "ef94a00e", "metadata": {}, "outputs": [], "source": [ "sample_rate = 44100\n", - "# Store all python script\n", - "codedir = 'NeMoTTS' \n", "# Store all manifest and audios\n", - "datadir = 'NeMoTTS_dataset'\n", + "data_dir = 'NeMoTTS_dataset'\n", "# Store all supplementary files\n", - "suppdir = \"NeMoTTS_sup_data\"\n", + "supp_dir = \"NeMoTTS_sup_data\"\n", "# Store all training logs\n", - "logsdir = \"NeMoTTS_logs\"\n", + "logs_dir = \"NeMoTTS_logs\"\n", "# Store all mel-spectrograms for vocoder training\n", - "melsdir = \"NeMoTTS_mels\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa143477", - "metadata": {}, - "outputs": [], - "source": [ - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set `codedir` to your local path.\n", - "!git clone https://github.com/NVIDIA/NeMo.git {codedir}" + "mels_dir = \"NeMoTTS_mels\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "f15c1fe6", + "id": "ce5629c0", "metadata": {}, "outputs": [], "source": [ @@ -125,25 +116,25 @@ { "cell_type": "code", "execution_count": null, - "id": "68e5a3ba", + "id": "efe59789", "metadata": {}, "outputs": [], "source": [ - "os.makedirs(codedir, exist_ok=True)\n", - "codedir = os.path.abspath(codedir)\n", - "os.makedirs(datadir, exist_ok=True)\n", - "datadir = os.path.abspath(datadir)\n", - "os.makedirs(suppdir, exist_ok=True)\n", - "suppdir = os.path.abspath(suppdir)\n", - "os.makedirs(logsdir, exist_ok=True)\n", - "logsdir = os.path.abspath(logsdir)\n", - "os.makedirs(melsdir, exist_ok=True)\n", - "melsdir = os.path.abspath(melsdir)" + "os.makedirs(code_dir, exist_ok=True)\n", + "code_dir = os.path.abspath(code_dir)\n", + "os.makedirs(data_dir, exist_ok=True)\n", + "data_dir = os.path.abspath(data_dir)\n", + "os.makedirs(supp_dir, exist_ok=True)\n", + "supp_dir = os.path.abspath(supp_dir)\n", + "os.makedirs(logs_dir, exist_ok=True)\n", + "logs_dir = os.path.abspath(logs_dir)\n", + "os.makedirs(mels_dir, exist_ok=True)\n", + "mels_dir = os.path.abspath(mels_dir)" ] }, { "cell_type": "markdown", - "id": "ab41f977", + "id": "41610544", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -151,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "2a84e0b6", + "id": "b47540d5", "metadata": {}, "source": [ "## a. 
Dataset Preparation\n", @@ -162,28 +153,28 @@ { "cell_type": "code", "execution_count": null, - "id": "a6323888", + "id": "9bd53b0b", "metadata": {}, "outputs": [], "source": [ - "!cd {datadir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" + "!cd {data_dir} && wget https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz && tar zxf vctk_subset_multispeaker.tar.gz" ] }, { "cell_type": "code", "execution_count": null, - "id": "910254f0", + "id": "3d3190bd", "metadata": {}, "outputs": [], "source": [ - "manidir = f\"{datadir}/vctk_subset_multispeaker\"\n", + "manidir = f\"{data_dir}/vctk_subset_multispeaker\"\n", "!ls {manidir}" ] }, { "cell_type": "code", "execution_count": null, - "id": "fd8ed27a", + "id": "eb0e390f", "metadata": {}, "outputs": [], "source": [ @@ -193,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "1347b6b0", + "id": "c86dd428", "metadata": {}, "source": [ "## b. Preprocessing" @@ -201,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "03fc2895", + "id": "747ab8ac", "metadata": {}, "source": [ "### Add absoluate audio path in manifest\n", @@ -211,40 +202,32 @@ { "cell_type": "code", "execution_count": null, - "id": "8f4b0267", + "id": "6d5e8fc8", "metadata": {}, "outputs": [], "source": [ - "def json_reader(filename):\n", - " lines = []\n", - " with open(filename) as f:\n", - " for line in f: lines.append(json.loads(line))\n", - " return lines\n", - "\n", - "def json_writer(manifest, filename):\n", - " with open(filename, 'w') as fout:\n", - " for m in manifest: fout.write(json.dumps(m) + '\\n') " + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest" ] }, { "cell_type": "code", "execution_count": null, - "id": "bd860349", + "id": "5883afc8", "metadata": {}, "outputs": [], "source": [ - "train_datas = json_reader(train_manifest)\n", + "train_datas = read_manifest(train_manifest)\n", "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "json_writer(train_datas, train_manifest)\n", + "write_manifest(train_manifest, train_datas)\n", "\n", - "valid_datas = json_reader(valid_manifest)\n", + "valid_datas = read_manifest(valid_manifest)\n", "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "json_writer(valid_datas, valid_manifest)" + "write_manifest(valid_manifest, valid_datas)" ] }, { "cell_type": "markdown", - "id": "02208d95", + "id": "4d2d38a2", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0\n", @@ -254,23 +237,23 @@ { "cell_type": "code", "execution_count": null, - "id": "2f28d82c", + "id": "675fd58c", "metadata": {}, "outputs": [], "source": [ - "train_datas = json_reader(train_manifest)\n", + "train_datas = read_manifest(train_manifest)\n", "speaker2id = {s: _id for _id, s in enumerate(set([m['speaker'] for m in train_datas]))}\n", "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", - "json_writer(train_datas, train_manifest)\n", + "write_manifest(train_manifest, train_datas)\n", "\n", - "valid_datas = json_reader(valid_manifest)\n", + "valid_datas = read_manifest(valid_manifest)\n", "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", - "json_writer(valid_datas, valid_manifest)" + "write_manifest(valid_manifest, valid_datas)" ] }, { "cell_type": "markdown", - "id": "edab7373", + "id": "81eeb8d8", "metadata": {}, 
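
The two preprocessing passes above only touch two fields of each manifest record: the audio path is made absolute and the raw speaker labels are remapped to contiguous ids starting from 0 so they can index the speaker embedding table. A minimal sketch of what happens to a single record; the field values here are illustrative, not taken from the actual VCTK subset:

```python
import json
import os

# One illustrative manifest record (values are made up for this sketch).
record = {"audio_filepath": "wavs/p225_001.wav", "text": "Please call Stella.",
          "duration": 2.3, "speaker": 225}

# 1) Absolute audio path, so training can be launched from any working directory.
manifest_root = "/abs/path/NeMoTTS_dataset/vctk_subset_multispeaker"  # hypothetical
record["audio_filepath"] = os.path.join(manifest_root, record["audio_filepath"])

# 2) Remap raw speaker labels to 0..N-1, keeping the original label for reference.
speaker2id = {225: 0, 226: 1}  # built from the training manifest, as in the notebook
record["old_speaker"], record["speaker"] = record["speaker"], speaker2id[record["speaker"]]

print(json.dumps(record))
```
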
"source": [ "### Extract Supplementary Data\n", @@ -283,13 +266,13 @@ { "cell_type": "code", "execution_count": null, - "id": "2c470e6c", + "id": "9ff88cf4", "metadata": {}, "outputs": [], "source": [ - "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + "!cd {code_dir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", " manifest_filepath={train_manifest} \\\n", - " sup_data_path={suppdir} \\\n", + " sup_data_path={supp_dir} \\\n", " dataset.sample_rate={sample_rate} \\\n", " dataset.n_fft=2048 \\\n", " dataset.win_length=2048 \\\n", @@ -298,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "ad21df85", + "id": "e3b662a6", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -311,13 +294,13 @@ { "cell_type": "code", "execution_count": null, - "id": "d1ed99d1", + "id": "f6429c82", "metadata": {}, "outputs": [], "source": [ - "!cd {codedir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", + "!cd {code_dir} && python scripts/dataset_processing/tts/extract_sup_data.py \\\n", " manifest_filepath={valid_manifest} \\\n", - " sup_data_path={suppdir} \\\n", + " sup_data_path={supp_dir} \\\n", " dataset.sample_rate={sample_rate} \\\n", " dataset.n_fft=2048 \\\n", " dataset.win_length=2048 \\\n", @@ -326,21 +309,21 @@ }, { "cell_type": "markdown", - "id": "eca9b498", + "id": "d5ea1ac3", "metadata": {}, "source": [ "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", "```bash\n", - "!cd {codedir} && python scripts/dataset_processing/tts/compute_speaker_stats.py \\\n", + "!cd {code_dir} && python scripts/dataset_processing/tts/compute_speaker_stats.py \\\n", " --manifest_path={train_manifest} \\\n", - " --sup_data_path={suppdir} \\\n", - " --pitch_stats_path={datadir}/pitch_stats.json\n", + " --sup_data_path={supp_dir} \\\n", + " --pitch_stats_path={data_dir}/pitch_stats.json\n", "```" ] }, { "cell_type": "markdown", - "id": "9d786e75", + "id": "a2b3c2a1", "metadata": {}, "source": [ "## c. 
Training" @@ -349,12 +332,12 @@ { "cell_type": "code", "execution_count": null, - "id": "d09fd354", + "id": "a6d5efb0", "metadata": {}, "outputs": [], "source": [ - "phoneme_dict_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", - "heteronyms_path = os.path.abspath(os.path.join(codedir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", + "phoneme_dict_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"cmudict-0.7b_nv22.10\"))\n", + "heteronyms_path = os.path.abspath(os.path.join(code_dir, \"scripts\", \"tts_dataset_files\", \"heteronyms-052722\"))\n", "\n", "# Copy and Paste the PITCH_MEAN and PITCH_STD from previous steps (train_manifest) to overide pitch_mean and pitch_std configs below.\n", "PITCH_MEAN=140.84278869628906\n", @@ -363,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "f1affd0e", + "id": "3b3f64bc", "metadata": {}, "source": [ "### Important notes\n", @@ -395,18 +378,18 @@ { "cell_type": "code", "execution_count": null, - "id": "035a989d", + "id": "f55c6ec8", "metadata": {}, "outputs": [], "source": [ "# Normally 200 epochs\n", - "!(cd {codedir} && python examples/tts/fastpitch.py \\\n", + "!(cd {code_dir} && python examples/tts/fastpitch.py \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", "+init_from_pretrained_model=\"tts_en_fastpitch\" \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", - "sup_data_path={suppdir} \\\n", + "sup_data_path={supp_dir} \\\n", "pitch_mean={PITCH_MEAN} \\\n", "pitch_std={PITCH_STD} \\\n", "phoneme_dict_path={phoneme_dict_path} \\\n", @@ -422,7 +405,7 @@ "model.train_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.max_duration=20 \\\n", "model.validation_ds.dataset.min_duration=0.1 \\\n", - "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.exp_dir={logs_dir} \\\n", "+exp_manager.create_wandb_logger=True \\\n", "+exp_manager.wandb_logger_kwargs.name=\"tutorial-FastPitch-pretrain-multispeaker\" \\\n", "+exp_manager.wandb_logger_kwargs.project=\"NeMo\" \\\n", @@ -438,12 +421,12 @@ { "cell_type": "code", "execution_count": null, - "id": "ebc15094", + "id": "44a24d11", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", - "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", "pretrained_fastpitch_checkpoint_ckpt = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", "print(pretrained_fastpitch_checkpoint_ckpt)\n", "pretrained_fastpitch_checkpoint_nemo = os.path.abspath(list(last_checkpoint_dir.glob('*.nemo'))[0])\n", @@ -452,7 +435,7 @@ }, { "cell_type": "markdown", - "id": "1f0f6353", + "id": "16ce3e6f", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -460,7 +443,7 @@ }, { "cell_type": "markdown", - "id": "12442284", + "id": "b399346e", "metadata": {}, "source": [ "## a. 
Dataset Preparation" @@ -469,24 +452,24 @@ { "cell_type": "code", "execution_count": null, - "id": "d7499ef4", + "id": "fbef01c5", "metadata": {}, "outputs": [], "source": [ - "!cd {codedir} \\\n", + "!cd {code_dir} \\\n", "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", "--input-json-manifest={train_manifest} \\\n", - "--input-sup-data-path={suppdir} \\\n", - "--output-folder={melsdir} \\\n", + "--input-sup-data-path={supp_dir} \\\n", + "--output-folder={mels_dir} \\\n", "--device=\"cuda:0\" \\\n", "--batch-size=1 \\\n", "--num-workers=1 \\\n", "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", "--input-json-manifest={valid_manifest} \\\n", - "--input-sup-data-path={suppdir} \\\n", - "--output-folder={melsdir} \\\n", + "--input-sup-data-path={supp_dir} \\\n", + "--output-folder={mels_dir} \\\n", "--device=\"cuda:0\" \\\n", "--batch-size=1 \\\n", "--num-workers=1" @@ -495,17 +478,17 @@ { "cell_type": "code", "execution_count": null, - "id": "e4b91945", + "id": "e99d9af9", "metadata": {}, "outputs": [], "source": [ - "train_manifest_mel = f\"{melsdir}/train_mel.json\"\n", - "valid_manifest_mel = f\"{melsdir}/dev_mel.json\"" + "train_manifest_mel = f\"{mels_dir}/train_mel.json\"\n", + "valid_manifest_mel = f\"{mels_dir}/dev_mel.json\"" ] }, { "cell_type": "markdown", - "id": "995dee11", + "id": "1cac303b", "metadata": {}, "source": [ "## b. Training" @@ -514,12 +497,12 @@ { "cell_type": "code", "execution_count": null, - "id": "525a4b58", + "id": "05037d1f", "metadata": {}, "outputs": [], "source": [ "# Normally 100 epochs\n", - "!cd {codedir} && python examples/tts/hifigan_finetune.py \\\n", + "!cd {code_dir} && python examples/tts/hifigan_finetune.py \\\n", "--config-name=hifigan_44100.yaml \\\n", "train_dataset={train_manifest_mel} \\\n", "validation_datasets={valid_manifest_mel} \\\n", @@ -533,7 +516,7 @@ "trainer.devices=-1 \\\n", "trainer.strategy='ddp' \\\n", "trainer.precision=16 \\\n", - "exp_manager.exp_dir={logsdir} \\\n", + "exp_manager.exp_dir={logs_dir} \\\n", "exp_manager.create_wandb_logger=True \\\n", "exp_manager.wandb_logger_kwargs.name=\"tutorial-HiFiGAN-finetune-multispeaker\" \\\n", "exp_manager.wandb_logger_kwargs.project=\"NeMo\"" @@ -542,19 +525,19 @@ { "cell_type": "code", "execution_count": null, - "id": "f7e2d6ff", + "id": "0e2aaaa0", "metadata": {}, "outputs": [], "source": [ "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX-last.ckpt\n", - "last_checkpoint_dir = sorted(list([i for i in (Path(logsdir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", "finetuned_hifigan_on_multispeaker_checkpoint" ] }, { "cell_type": "markdown", - "id": "91724158", + "id": "c58a7fc8", "metadata": {}, "source": [ "# 3. Inference" @@ -563,7 +546,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbd21bcf", + "id": "67c63ed1", "metadata": {}, "outputs": [], "source": [ @@ -577,7 +560,7 @@ }, { "cell_type": "markdown", - "id": "2dd18808", + "id": "a15f288c", "metadata": {}, "source": [ "## a. 
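
After `resynthesize_dataset.py` finishes, each record in the new manifests should point at a generated spectrogram in addition to the original audio, typically via a `mel_filepath` field, which is what the `train_ds_finetune`/`val_ds_finetune` configs below consume. A quick way to confirm before starting HiFiGAN fine-tuning:

```python
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest

train_manifest_mel = "NeMoTTS_mels/train_mel.json"  # as defined above
first = read_manifest(train_manifest_mel)[0]
print(sorted(first.keys()))  # expect a field pointing at the generated mel, e.g. 'mel_filepath'
print(first)
```
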
Load Model" @@ -586,7 +569,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32b934e3", + "id": "1d40ea47", "metadata": {}, "outputs": [], "source": [ @@ -596,7 +579,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4bf4c97a", + "id": "e8905a5e", "metadata": {}, "outputs": [], "source": [ @@ -607,7 +590,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9cbab303", + "id": "799c2f1d", "metadata": {}, "outputs": [], "source": [ @@ -617,7 +600,7 @@ }, { "cell_type": "markdown", - "id": "1b88d0d7", + "id": "d8cf4a4d", "metadata": {}, "source": [ "## b. Output Audio" @@ -626,7 +609,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a2c47fb", + "id": "be5ac403", "metadata": {}, "outputs": [], "source": [ @@ -661,7 +644,7 @@ { "cell_type": "code", "execution_count": null, - "id": "afdca860", + "id": "d519e381", "metadata": {}, "outputs": [], "source": [ @@ -687,7 +670,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b45ba3af", + "id": "552b543b", "metadata": {}, "outputs": [], "source": [ @@ -722,7 +705,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6aa522c5", + "id": "0995a836", "metadata": {}, "outputs": [], "source": [ @@ -733,7 +716,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d676763a", + "id": "ebb8d6c4", "metadata": {}, "outputs": [], "source": [] From e50a60312b6136dffca16f55fa2082c71b845e40 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Thu, 27 Apr 2023 14:08:53 -0700 Subject: [PATCH 14/25] Use .nemo Signed-off-by: hsiehjackson --- .../conf/fastpitch_align_44100_adapter.yaml | 10 +- .../tts/FastPitch_Adapter_Finetuning.ipynb | 159 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 124 +++++++------- 3 files changed, 143 insertions(+), 150 deletions(-) diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml index bac6a64b06e9..8a5ea17ef22d 100644 --- a/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ b/examples/tts/conf/fastpitch_align_44100_adapter.yaml @@ -208,7 +208,7 @@ model: dropatt: 0.1 dropemb: 0.0 d_embed: ${model.symbols_embedding_dim} - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] output_fft: _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder @@ -221,12 +221,12 @@ model: dropout: 0.1 dropatt: 0.1 dropemb: 0.0 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] alignment_module: _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder n_text_channels: ${model.symbols_embedding_dim} - condition_types: [ "add" ] # options: [ "add", "cat" ] + condition_types: [ "add" ] # options: [ "add", "concat" ] duration_predictor: _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor @@ -235,7 +235,7 @@ model: filter_size: 256 dropout: 0.1 n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] pitch_predictor: _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor @@ -244,7 +244,7 @@ model: filter_size: 256 dropout: 0.1 n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] 
speaker_encoder: _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 5a02eed23fdf..7889e5d646ea 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1e065b5e", + "id": "ef81c313", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "89eb3a1f", + "id": "8c72971e", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95ccd2a1", + "id": "fce36784", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96c49a4c", + "id": "eed8181f", "metadata": {}, "outputs": [], "source": [ @@ -83,10 +83,11 @@ { "cell_type": "code", "execution_count": null, - "id": "6037a349", + "id": "a697e184", "metadata": {}, "outputs": [], "source": [ + "# .nemo files for your pre-trained FastPitch and HiFiGAN\n", "pretrained_fastpitch_checkpoint = \"\"\n", "finetuned_hifigan_on_multispeaker_checkpoint = \"\"" ] @@ -94,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "131451f0", + "id": "af1a9431", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d033777", + "id": "51d5bdb4", "metadata": {}, "outputs": [], "source": [ @@ -130,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1c71085a", + "id": "cb2ed8da", "metadata": {}, "outputs": [], "source": [ @@ -148,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "e52a7746", + "id": "60afc9e1", "metadata": {}, "source": [ "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -157,10 +158,11 @@ { "cell_type": "code", "execution_count": null, - "id": "f318ea3a", + "id": "682b279e", "metadata": {}, "outputs": [], "source": [ + "from nemo.collections.tts.models import FastPitchModel\n", "from nemo.core import adapter_mixins\n", "from omegaconf import DictConfig, OmegaConf, open_dict" ] @@ -168,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b4c9f660", + "id": "88654e4b", "metadata": {}, "outputs": [], "source": [ @@ -200,31 +202,25 @@ { "cell_type": "code", "execution_count": null, - "id": "59e5d4e5", - "metadata": {}, - "outputs": [], - "source": [ - "state = torch.load(pretrained_fastpitch_checkpoint)\n", - "state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", - "torch.save(state, pretrained_fastpitch_checkpoint)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45ba24e2", + "id": "1f11a628", "metadata": {}, "outputs": [], "source": [ - "shutil.copyfile(pretrained_fastpitch_checkpoint, \"FastPitch.ckpt\")\n", - "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"HifiGan.ckpt\")\n", - "pretrained_fastpitch_checkpoint = os.path.abspath(\"FastPitch.ckpt\")\n", - "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"HifiGan.ckpt\")" + "model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", + "model.cfg = update_model_config_to_support_adapter(model.cfg)\n", + "model.save_to('Pretrained-FastPitch.nemo')\n", + "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"Pretrained-HifiGan.nemo\")\n", + "\n", + "pretrained_fastpitch_checkpoint = os.path.abspath(\"Pretrained-FastPitch.nemo\")\n", + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"Pretrained-HifiGan.nemo\")\n", + "# state = torch.load(pretrained_fastpitch_checkpoint)\n", + "# state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", + "# torch.save(state, pretrained_fastpitch_checkpoint)" ] }, { "cell_type": "markdown", - "id": "f7be0c03", + "id": "6879a60c", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -232,7 +228,7 @@ }, { "cell_type": "markdown", - "id": "e3d6e3e3", + "id": "207c7f02", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -242,7 +238,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a92c174", + "id": "9a7a55ba", "metadata": {}, "outputs": [], "source": [ @@ -252,7 +248,7 @@ { "cell_type": "code", "execution_count": null, - "id": "119970aa", + "id": "3c32a4e5", "metadata": {}, "outputs": [], "source": [ @@ -263,7 +259,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d46a308", + "id": "fab9f979", "metadata": {}, "outputs": [], "source": [ @@ -273,7 +269,7 @@ }, { "cell_type": "markdown", - "id": "8540d872", + "id": "c1df877a", "metadata": {}, "source": [ "## b. 
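
The body of `update_model_config_to_support_adapter` is unchanged by this patch, so the diff does not show it; conceptually it walks the FastPitch config and swaps each sub-module's `_target_` class for the adapter-compatible class registered with NeMo's adapter mixins. A rough sketch of that idea (the module names and registry helper are assumptions based on the notebook's imports, not part of this diff):

```python
from nemo.core import adapter_mixins
from omegaconf import DictConfig, open_dict

def sketch_update_config(cfg: DictConfig) -> DictConfig:
    # Point each adapter-capable sub-module at its registered adapter class.
    with open_dict(cfg):
        for name in ["input_fft", "output_fft", "duration_predictor", "pitch_predictor"]:
            registered = adapter_mixins.get_registered_adapter(cfg[name]._target_)
            if registered is not None:
                cfg[name]._target_ = registered.adapter_class_path
    return cfg
```
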
Preprocessing" @@ -281,7 +277,7 @@ }, { "cell_type": "markdown", - "id": "ea07ff33", + "id": "af34a373", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -291,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98bb7ab6", + "id": "eb43d862", "metadata": {}, "outputs": [], "source": [ @@ -301,7 +297,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf7512c9", + "id": "4dcdb0e3", "metadata": {}, "outputs": [], "source": [ @@ -316,7 +312,7 @@ }, { "cell_type": "markdown", - "id": "50d529ea", + "id": "93c0f45a", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -329,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca883965", + "id": "87661e4c", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +340,7 @@ }, { "cell_type": "markdown", - "id": "dd5671bd", + "id": "a939bde2", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -357,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "820b6fa2", + "id": "332c8be8", "metadata": {}, "outputs": [], "source": [ @@ -372,7 +368,7 @@ }, { "cell_type": "markdown", - "id": "aac60eb0", + "id": "ed26e91b", "metadata": {}, "source": [ "## c. Training" @@ -381,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19c47142", + "id": "80cb0e92", "metadata": {}, "outputs": [], "source": [ @@ -395,7 +391,7 @@ }, { "cell_type": "markdown", - "id": "6bfc7c70", + "id": "0d599575", "metadata": {}, "source": [ "### Important notes\n", @@ -410,14 +406,14 @@ { "cell_type": "code", "execution_count": null, - "id": "7e5ee24e", + "id": "17638cb9", "metadata": {}, "outputs": [], "source": [ "# Normally 100 epochs\n", "!cd {code_dir} && python examples/tts/fastpitch_finetune_adapters.py \\\n", "--config-name=fastpitch_align_44100_adapter.yaml \\\n", - "+init_from_ptl_ckpt={pretrained_fastpitch_checkpoint} \\\n", + "+init_from_nemo_model={pretrained_fastpitch_checkpoint} \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", @@ -446,11 +442,11 @@ { "cell_type": "code", "execution_count": null, - "id": "8a298487", + "id": "402ae669", "metadata": {}, "outputs": [], "source": [ - "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/adapters.pt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", "finetuned_adapter_checkpoint = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", "print(finetuned_adapter_checkpoint)" @@ -458,7 +454,7 @@ }, { "cell_type": "markdown", - "id": "1f9cebb4", + "id": "33dcbc8f", "metadata": {}, "source": [ "# 4. Fine-tune HiFiGAN on adaptation data" @@ -466,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "bb1a64bb", + "id": "37090445", "metadata": {}, "source": [ "## a. 
Dataset Preparation\n", @@ -476,13 +472,12 @@ { "cell_type": "code", "execution_count": null, - "id": "d443cbf5", + "id": "ed43ef5a", "metadata": {}, "outputs": [], "source": [ "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", - "from nemo.collections.tts.models import FastPitchModel\n", "from collections import defaultdict\n", "import random\n", "random.seed(100)" @@ -491,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fc742c61", + "id": "c45efcd5", "metadata": {}, "outputs": [], "source": [ @@ -548,14 +543,14 @@ { "cell_type": "code", "execution_count": null, - "id": "d1ebcc29", + "id": "5ec8352b", "metadata": {}, "outputs": [], "source": [ "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", "\n", "# Pretrained FastPitch Weights\n", - "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint)\n", + "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", "\n", "# Load Adapter Weights\n", "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", @@ -567,7 +562,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4db5892a", + "id": "ebde1095", "metadata": {}, "outputs": [], "source": [ @@ -599,7 +594,7 @@ }, { "cell_type": "markdown", - "id": "daea80ef", + "id": "1fa78c31", "metadata": {}, "source": [ "## b. Training" @@ -608,7 +603,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e101e6b", + "id": "a131a743", "metadata": {}, "outputs": [], "source": [ @@ -617,7 +612,7 @@ "--config-name=hifigan_44100.yaml \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", - "+init_from_ptl_ckpt={finetuned_hifigan_on_multispeaker_checkpoint} \\\n", + "+init_from_nemo_model={finetuned_hifigan_on_multispeaker_checkpoint} \\\n", "model.train_ds.dataloader_params.batch_size=32 \\\n", "model.optim.lr=0.0001 \\\n", "model/train_ds=train_ds_finetune \\\n", @@ -636,19 +631,19 @@ { "cell_type": "code", "execution_count": null, - "id": "0bb975e0", + "id": "3677e57e", "metadata": {}, "outputs": [], "source": [ - "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX.ckpt\n", + "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan.nemo\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "finetuned_hifigan_on_adaptation_checkpoint = list(last_checkpoint_dir.glob('*-last.ckpt'))[0]\n", + "finetuned_hifigan_on_adaptation_checkpoint = list(last_checkpoint_dir.glob('*.nemo'))[0]\n", "finetuned_hifigan_on_adaptation_checkpoint" ] }, { "cell_type": "markdown", - "id": "9e12f211", + "id": "6fd425ef", "metadata": {}, "source": [ "# 3. Inference" @@ -657,7 +652,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86c35868", + "id": "2de65974", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +663,7 @@ }, { "cell_type": "markdown", - "id": "626525b0", + "id": "981087a2", "metadata": {}, "source": [ "## a. 
Load Model" @@ -677,7 +672,7 @@ { "cell_type": "code", "execution_count": null, - "id": "805b4909", + "id": "9be586e8", "metadata": {}, "outputs": [], "source": [ @@ -687,12 +682,12 @@ { "cell_type": "code", "execution_count": null, - "id": "17352083", + "id": "f5b0153e", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", - "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint)\n", + "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", "spec_model = spec_model.eval().cuda()" ] @@ -700,17 +695,17 @@ { "cell_type": "code", "execution_count": null, - "id": "d6c36a10", + "id": "7ab16ed8", "metadata": {}, "outputs": [], "source": [ "# HiFiGAN\n", - "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=finetuned_hifigan_on_adaptation_checkpoint).eval().cuda()" + "vocoder_model = HifiGanModel.restore_from(finetuned_hifigan_on_adaptation_checkpoint).eval().cuda()" ] }, { "cell_type": "markdown", - "id": "8a71d4d7", + "id": "dcf26212", "metadata": {}, "source": [ "## b. Output Audio" @@ -719,7 +714,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eea31431", + "id": "8b6f638f", "metadata": {}, "outputs": [], "source": [ @@ -735,7 +730,7 @@ "def gen_spectrogram(text, spec_gen_model, reference_spec, reference_spec_lens):\n", " parsed = spec_gen_model.parse(text)\n", " with torch.no_grad(): \n", - " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", + " spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, \n", " reference_spec=reference_spec, \n", " reference_spec_lens=reference_spec_lens)\n", "\n", @@ -752,7 +747,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb0437a3", + "id": "79f7a094", "metadata": {}, "outputs": [], "source": [ @@ -775,7 +770,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e6ddd463", + "id": "a9df77ac", "metadata": {}, "outputs": [], "source": [ @@ -807,7 +802,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3342281", + "id": "b255ba13", "metadata": {}, "outputs": [], "source": [ @@ -820,7 +815,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59bd98b4", + "id": "8699a038", "metadata": {}, "outputs": [], "source": [ @@ -832,13 +827,21 @@ { "cell_type": "code", "execution_count": null, - "id": "6a65f215", + "id": "9e656010", "metadata": {}, "outputs": [], "source": [ "print(f\"FastPitch nemo file: {fintuned_fastpitch}\")\n", "print(f\"HiFi-Gan nemo file: {fintuned_hifigan}\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e4e710f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 1c8302400b42..4d58a1a65984 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "8318580f", + "id": "37c81873", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "45f9b2a8", + "id": "b5c8e509", "metadata": {}, "source": [ "# License\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2a06cd68", + "id": "c49668b5", "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "559da21f", + "id": "c380267f", 
"metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef94a00e", + "id": "f8aaa707", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce5629c0", + "id": "7ddce580", "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "efe59789", + "id": "52dfe506", "metadata": {}, "outputs": [], "source": [ @@ -134,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "41610544", + "id": "748a7ec4", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -142,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "b47540d5", + "id": "ec7f0675", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -153,7 +153,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9bd53b0b", + "id": "d35cbeaa", "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3d3190bd", + "id": "7335c3dc", "metadata": {}, "outputs": [], "source": [ @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb0e390f", + "id": "40e909e8", "metadata": {}, "outputs": [], "source": [ @@ -184,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "c86dd428", + "id": "a0e872e5", "metadata": {}, "source": [ "## b. Preprocessing" @@ -192,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "747ab8ac", + "id": "5f15d8ab", "metadata": {}, "source": [ "### Add absoluate audio path in manifest\n", @@ -202,7 +202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d5e8fc8", + "id": "5b53ba5b", "metadata": {}, "outputs": [], "source": [ @@ -212,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5883afc8", + "id": "3c18263f", "metadata": {}, "outputs": [], "source": [ @@ -227,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "4d2d38a2", + "id": "93d4c50b", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0\n", @@ -237,7 +237,7 @@ { "cell_type": "code", "execution_count": null, - "id": "675fd58c", + "id": "b685c82a", "metadata": {}, "outputs": [], "source": [ @@ -253,7 +253,7 @@ }, { "cell_type": "markdown", - "id": "81eeb8d8", + "id": "c2a6f14a", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ff88cf4", + "id": "8ce575e7", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "e3b662a6", + "id": "d514c61a", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6429c82", + "id": "daad5e1e", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +309,7 @@ }, { "cell_type": "markdown", - "id": "d5ea1ac3", + "id": "93ecb419", "metadata": {}, "source": [ "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", @@ -323,7 +323,7 @@ }, { "cell_type": "markdown", - "id": "a2b3c2a1", + "id": "68bd8bbb", "metadata": {}, "source": [ "## c. 
Training" @@ -332,7 +332,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6d5efb0", + "id": "183a5c8b", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "3b3f64bc", + "id": "bf6ec2f2", "metadata": {}, "source": [ "### Important notes\n", @@ -378,7 +378,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f55c6ec8", + "id": "4826ecc9", "metadata": {}, "outputs": [], "source": [ @@ -421,21 +421,19 @@ { "cell_type": "code", "execution_count": null, - "id": "44a24d11", + "id": "51e99525", "metadata": {}, "outputs": [], "source": [ - "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch--val_loss=XXX-epoch=XXX-last.ckpt\n", + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch.nemo\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "pretrained_fastpitch_checkpoint_ckpt = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", - "print(pretrained_fastpitch_checkpoint_ckpt)\n", - "pretrained_fastpitch_checkpoint_nemo = os.path.abspath(list(last_checkpoint_dir.glob('*.nemo'))[0])\n", - "print(pretrained_fastpitch_checkpoint_nemo)" + "pretrained_fastpitch_checkpoint = os.path.abspath(list(last_checkpoint_dir.glob('*.nemo'))[0])\n", + "print(pretrained_fastpitch_checkpoint)" ] }, { "cell_type": "markdown", - "id": "16ce3e6f", + "id": "870acf5a", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -443,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "b399346e", + "id": "05a47e2c", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -452,13 +450,13 @@ { "cell_type": "code", "execution_count": null, - "id": "fbef01c5", + "id": "3d851795", "metadata": {}, "outputs": [], "source": [ "!cd {code_dir} \\\n", "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", - "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", + "--model-path={pretrained_fastpitch_checkpoint} \\\n", "--input-json-manifest={train_manifest} \\\n", "--input-sup-data-path={supp_dir} \\\n", "--output-folder={mels_dir} \\\n", @@ -466,7 +464,7 @@ "--batch-size=1 \\\n", "--num-workers=1 \\\n", "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", - "--model-path={pretrained_fastpitch_checkpoint_nemo} \\\n", + "--model-path={pretrained_fastpitch_checkpoint} \\\n", "--input-json-manifest={valid_manifest} \\\n", "--input-sup-data-path={supp_dir} \\\n", "--output-folder={mels_dir} \\\n", @@ -478,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e99d9af9", + "id": "a430cd28", "metadata": {}, "outputs": [], "source": [ @@ -488,7 +486,7 @@ }, { "cell_type": "markdown", - "id": "1cac303b", + "id": "84e79334", "metadata": {}, "source": [ "## b. Training" @@ -497,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05037d1f", + "id": "92d9562f", "metadata": {}, "outputs": [], "source": [ @@ -513,7 +511,7 @@ "model/validation_ds=val_ds_finetune \\\n", "+trainer.max_epochs=5 \\\n", "trainer.check_val_every_n_epoch=5 \\\n", - "trainer.devices=-1 \\\n", + "trainer.devices=1 \\\n", "trainer.strategy='ddp' \\\n", "trainer.precision=16 \\\n", "exp_manager.exp_dir={logs_dir} \\\n", @@ -525,19 +523,19 @@ { "cell_type": "code", "execution_count": null, - "id": "0e2aaaa0", + "id": "8e82bc3a", "metadata": {}, "outputs": [], "source": [ - "# e.g. NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan--val_loss=XXX-epoch=XXX-last.ckpt\n", + "# e.g. 
NeMoTTS_logs/HifiGan/Y-M-D_H-M-S/checkpoints/HifiGan.nemo\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"HifiGan\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", - "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(list(last_checkpoint_dir.glob('*-last.ckpt'))[0])\n", + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(list(last_checkpoint_dir.glob('*.nemo'))[0])\n", "finetuned_hifigan_on_multispeaker_checkpoint" ] }, { "cell_type": "markdown", - "id": "c58a7fc8", + "id": "e7da2068", "metadata": {}, "source": [ "# 3. Inference" @@ -546,7 +544,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67c63ed1", + "id": "3c71f05f", "metadata": {}, "outputs": [], "source": [ @@ -560,7 +558,7 @@ }, { "cell_type": "markdown", - "id": "a15f288c", + "id": "d0575000", "metadata": {}, "source": [ "## a. Load Model" @@ -569,7 +567,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1d40ea47", + "id": "b35cd5f9", "metadata": {}, "outputs": [], "source": [ @@ -579,28 +577,28 @@ { "cell_type": "code", "execution_count": null, - "id": "e8905a5e", + "id": "bb9f7f89", "metadata": {}, "outputs": [], "source": [ "# FastPitch\n", - "spec_model = FastPitchModel.load_from_checkpoint(pretrained_fastpitch_checkpoint_ckpt).eval().cuda()" + "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint).eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "799c2f1d", + "id": "8fd30da0", "metadata": {}, "outputs": [], "source": [ "# HiFiGAN\n", - "vocoder_model = HifiGanModel.load_from_checkpoint(checkpoint_path=finetuned_hifigan_on_multispeaker_checkpoint).eval().cuda()" + "vocoder_model = HifiGanModel.restore_from(finetuned_hifigan_on_multispeaker_checkpoint).eval().cuda()" ] }, { "cell_type": "markdown", - "id": "d8cf4a4d", + "id": "af905e0c", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -609,7 +607,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be5ac403", + "id": "fb19b665", "metadata": {}, "outputs": [], "source": [ @@ -644,7 +642,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d519e381", + "id": "13b4d317", "metadata": {}, "outputs": [], "source": [ @@ -670,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "552b543b", + "id": "624419ed", "metadata": {}, "outputs": [], "source": [ @@ -705,21 +703,13 @@ { "cell_type": "code", "execution_count": null, - "id": "0995a836", + "id": "dae80745", "metadata": {}, "outputs": [], "source": [ - "print(f\"FastPitch checkpoint: {pretrained_fastpitch_checkpoint_ckpt}\")\n", + "print(f\"FastPitch checkpoint: {pretrained_fastpitch_checkpoint}\")\n", "print(f\"HiFi-Gan checkpoint: {finetuned_hifigan_on_multispeaker_checkpoint}\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebb8d6c4", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From b58c6774234b6f953288793be46f82940746fd15 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Thu, 27 Apr 2023 17:34:10 -0700 Subject: [PATCH 15/25] Follow Comments Signed-off-by: hsiehjackson --- .../tts/resynthesize_dataset.py | 1 - .../tts/FastPitch_Adapter_Finetuning.ipynb | 126 +++++++++--------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 96 +++++++------ 3 files changed, 109 insertions(+), 114 deletions(-) diff --git a/scripts/dataset_processing/tts/resynthesize_dataset.py b/scripts/dataset_processing/tts/resynthesize_dataset.py index cebccc882b27..652fde299572 100644 --- a/scripts/dataset_processing/tts/resynthesize_dataset.py +++ b/scripts/dataset_processing/tts/resynthesize_dataset.py @@ -80,7 +80,6 @@ def chunks(iterable: Iterable, size: int) -> Iterator[List]: def load_model(path: Path, device: torch.device) -> SpectrogramGenerator: - print(path) model = None if path.suffix == ".nemo": model = SpectrogramGenerator.restore_from(path, map_location=device) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 7889e5d646ea..10f5481be95a 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ef81c313", + "id": "646925fb", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "8c72971e", + "id": "1cb5bf85", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fce36784", + "id": "64d0a1dc", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eed8181f", + "id": "8ed7f938", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a697e184", + "id": "3f63ad0d", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af1a9431", + "id": "bf9b2e6d", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51d5bdb4", + "id": "ce31578e", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb2ed8da", + "id": "f8e5cbc4", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "60afc9e1", + "id": "a198a651", "metadata": {}, 
"source": [ "# 1. Transform pre-trained checkpoint to adapter-compatible checkpoint" @@ -158,7 +158,7 @@ { "cell_type": "code", "execution_count": null, - "id": "682b279e", + "id": "9bc36276", "metadata": {}, "outputs": [], "source": [ @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88654e4b", + "id": "b3a85cb0", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f11a628", + "id": "8952410e", "metadata": {}, "outputs": [], "source": [ @@ -212,15 +212,12 @@ "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"Pretrained-HifiGan.nemo\")\n", "\n", "pretrained_fastpitch_checkpoint = os.path.abspath(\"Pretrained-FastPitch.nemo\")\n", - "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"Pretrained-HifiGan.nemo\")\n", - "# state = torch.load(pretrained_fastpitch_checkpoint)\n", - "# state['hyper_parameters']['cfg'] = update_model_config_to_support_adapter(state['hyper_parameters']['cfg'])\n", - "# torch.save(state, pretrained_fastpitch_checkpoint)" + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"Pretrained-HifiGan.nemo\")" ] }, { "cell_type": "markdown", - "id": "6879a60c", + "id": "19c33aab", "metadata": {}, "source": [ "# 2. Fine-tune FastPitch on adaptation data" @@ -228,7 +225,7 @@ }, { "cell_type": "markdown", - "id": "207c7f02", + "id": "d01c7d22", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -238,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a7a55ba", + "id": "33c526aa", "metadata": {}, "outputs": [], "source": [ @@ -248,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c32a4e5", + "id": "6e00251f", "metadata": {}, "outputs": [], "source": [ @@ -259,7 +256,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fab9f979", + "id": "4366aefc", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "c1df877a", + "id": "50940938", "metadata": {}, "source": [ "## b. Preprocessing" @@ -277,7 +274,7 @@ }, { "cell_type": "markdown", - "id": "af34a373", + "id": "6f51e8c3", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -287,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb43d862", + "id": "7c7fd91d", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4dcdb0e3", + "id": "45a97208", "metadata": {}, "outputs": [], "source": [ @@ -312,20 +309,18 @@ }, { "cell_type": "markdown", - "id": "93c0f45a", + "id": "94ce06e4", "metadata": {}, "source": [ "### Extract Supplementary Data\n", "\n", - "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script.\n", - "\n", - "Note: This is an optional step, if skipped, it will be automatically executed within the first epoch of training FastPitch." + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). 
To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." ] }, { "cell_type": "code", "execution_count": null, - "id": "87661e4c", + "id": "47594938", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "a939bde2", + "id": "826bb1d1", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -353,7 +348,7 @@ { "cell_type": "code", "execution_count": null, - "id": "332c8be8", + "id": "900f7d42", "metadata": {}, "outputs": [], "source": [ @@ -368,7 +363,7 @@ }, { "cell_type": "markdown", - "id": "ed26e91b", + "id": "f03c022e", "metadata": {}, "source": [ "## c. Training" @@ -377,7 +372,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80cb0e92", + "id": "8a5ad291", "metadata": {}, "outputs": [], "source": [ @@ -391,7 +386,7 @@ }, { "cell_type": "markdown", - "id": "0d599575", + "id": "0b9e8f8e", "metadata": {}, "source": [ "### Important notes\n", @@ -406,7 +401,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17638cb9", + "id": "24fd1e9a", "metadata": {}, "outputs": [], "source": [ @@ -442,7 +437,7 @@ { "cell_type": "code", "execution_count": null, - "id": "402ae669", + "id": "22936e1c", "metadata": {}, "outputs": [], "source": [ @@ -454,25 +449,29 @@ }, { "cell_type": "markdown", - "id": "33dcbc8f", + "id": "46ff1669", "metadata": {}, "source": [ - "# 4. Fine-tune HiFiGAN on adaptation data" + "# 3. Fine-tune HiFiGAN on adaptation data" ] }, { "cell_type": "markdown", - "id": "37090445", + "id": "4b16bde0", "metadata": {}, "source": [ "## a. Dataset Preparation\n", - "Generate mel-spectrograms for HiFiGAN training." + "Generate mel-spectrograms for HiFiGAN training.\n", + "### Important notes\n", + "* We don't use existing scripts (e.g. `resynthesize_dataset.py` or `generate_mels.py`) \n", + "* We may need `reference spectrogram` or `speaker` as the input of FastPitch.\n", + "* We call `load_adapters` to resume our FastPitch" ] }, { "cell_type": "code", "execution_count": null, - "id": "ed43ef5a", + "id": "6f4ed978", "metadata": {}, "outputs": [], "source": [ @@ -486,7 +485,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c45efcd5", + "id": "9b21631f", "metadata": {}, "outputs": [], "source": [ @@ -523,7 +522,6 @@ " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", " \n", - " \n", " with torch.no_grad():\n", " spectrogram = spec_model.forward(\n", " text=text, \n", @@ -543,7 +541,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ec8352b", + "id": "eeefd908", "metadata": {}, "outputs": [], "source": [ @@ -562,7 +560,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ebde1095", + "id": "83b1705c", "metadata": {}, "outputs": [], "source": [ @@ -594,7 +592,7 @@ }, { "cell_type": "markdown", - "id": "1fa78c31", + "id": "9044e5d8", "metadata": {}, "source": [ "## b. 
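
Because the spectrograms for HiFiGAN fine-tuning are generated with the adapted FastPitch rather than an existing script, each manifest record ends up carrying a pointer to the saved mel next to the original audio. The sketch below only illustrates the shape of such a record; the mel array, the paths and the exact field name are assumptions based on what the fine-tuning dataset configs load:

```python
import json
import os
import numpy as np

os.makedirs("NeMoTTS_mels", exist_ok=True)
mel = np.random.rand(80, 200).astype(np.float32)   # stand-in for a generated [n_mels, T] mel
mel_path = "NeMoTTS_mels/p226_001_mel.npy"         # hypothetical output path
np.save(mel_path, mel)

record = {
    "audio_filepath": "/abs/path/p226_001.wav",    # hypothetical
    "text": "Please call Stella.",
    "duration": 2.3,
    "mel_filepath": mel_path,                      # consumed by train_ds_finetune / val_ds_finetune
}
print(json.dumps(record))
```
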
Training" @@ -603,7 +601,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a131a743", + "id": "f72d469b", "metadata": {}, "outputs": [], "source": [ @@ -631,7 +629,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3677e57e", + "id": "465fe036", "metadata": {}, "outputs": [], "source": [ @@ -643,16 +641,16 @@ }, { "cell_type": "markdown", - "id": "6fd425ef", + "id": "54f7ae0b", "metadata": {}, "source": [ - "# 3. Inference" + "# 4. Inference" ] }, { "cell_type": "code", "execution_count": null, - "id": "2de65974", + "id": "4edc2f56", "metadata": {}, "outputs": [], "source": [ @@ -663,7 +661,7 @@ }, { "cell_type": "markdown", - "id": "981087a2", + "id": "92a88dab", "metadata": {}, "source": [ "## a. Load Model" @@ -672,7 +670,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9be586e8", + "id": "460ad165", "metadata": {}, "outputs": [], "source": [ @@ -682,7 +680,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5b0153e", + "id": "4b4f8c61", "metadata": {}, "outputs": [], "source": [ @@ -695,7 +693,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ab16ed8", + "id": "568a5e3c", "metadata": {}, "outputs": [], "source": [ @@ -705,7 +703,7 @@ }, { "cell_type": "markdown", - "id": "dcf26212", + "id": "ded03c63", "metadata": {}, "source": [ "## b. Output Audio" @@ -714,7 +712,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b6f638f", + "id": "f2c06bd4", "metadata": {}, "outputs": [], "source": [ @@ -747,7 +745,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79f7a094", + "id": "1b876880", "metadata": {}, "outputs": [], "source": [ @@ -770,7 +768,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a9df77ac", + "id": "59a4a4bb", "metadata": {}, "outputs": [], "source": [ @@ -802,7 +800,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b255ba13", + "id": "db0c6830", "metadata": {}, "outputs": [], "source": [ @@ -815,7 +813,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8699a038", + "id": "0d022577", "metadata": {}, "outputs": [], "source": [ @@ -827,7 +825,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e656010", + "id": "2e8b30f5", "metadata": {}, "outputs": [], "source": [ @@ -838,7 +836,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0e4e710f", + "id": "7f61605c", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index 4d58a1a65984..e0edbde2f845 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "37c81873", + "id": "3dd6dd51", "metadata": {}, "source": [ "# FastPitch MultiSpeaker Pretraining\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "b5c8e509", + "id": "915d3200", "metadata": {}, "source": [ "# License\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c49668b5", + "id": "e0bd81d8", "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c380267f", + "id": "2cf04110", "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8aaa707", + "id": "dc30b714", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ddce580", + "id": "a582efd1", "metadata": {}, "outputs": 
[], "source": [ @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52dfe506", + "id": "e1c5e7a0", "metadata": {}, "outputs": [], "source": [ @@ -134,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "748a7ec4", + "id": "52dcee31", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -142,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "ec7f0675", + "id": "068a6ff0", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -153,7 +153,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d35cbeaa", + "id": "155329c5", "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7335c3dc", + "id": "de97ae34", "metadata": {}, "outputs": [], "source": [ @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40e909e8", + "id": "571eb693", "metadata": {}, "outputs": [], "source": [ @@ -184,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "a0e872e5", + "id": "86a56ab6", "metadata": {}, "source": [ "## b. Preprocessing" @@ -192,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "5f15d8ab", + "id": "4f52f928", "metadata": {}, "source": [ "### Add absoluate audio path in manifest\n", @@ -202,7 +202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b53ba5b", + "id": "756e4415", "metadata": {}, "outputs": [], "source": [ @@ -212,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c18263f", + "id": "86112ae9", "metadata": {}, "outputs": [], "source": [ @@ -227,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "93d4c50b", + "id": "0978c35d", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0\n", @@ -237,7 +237,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b685c82a", + "id": "8bba736b", "metadata": {}, "outputs": [], "source": [ @@ -253,20 +253,18 @@ }, { "cell_type": "markdown", - "id": "c2a6f14a", + "id": "48f9c4ab", "metadata": {}, "source": [ "### Extract Supplementary Data\n", "\n", - "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script.\n", - "\n", - "Note: This is an optional step, if skipped, it will be automatically executed within the first epoch of training FastPitch." + "As mentioned in the [FastPitch and MixerTTS training tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script." ] }, { "cell_type": "code", "execution_count": null, - "id": "8ce575e7", + "id": "c30d68d1", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +279,7 @@ }, { "cell_type": "markdown", - "id": "d514c61a", + "id": "6f42b01a", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. 
We will be there in the following section.\n", @@ -294,7 +292,7 @@ { "cell_type": "code", "execution_count": null, - "id": "daad5e1e", + "id": "ee5193bf", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "93ecb419", + "id": "f55e2605", "metadata": {}, "source": [ "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", @@ -323,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "68bd8bbb", + "id": "04ea7c7a", "metadata": {}, "source": [ "## c. Training" @@ -332,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "183a5c8b", + "id": "254ed2e1", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "bf6ec2f2", + "id": "9b38817f", "metadata": {}, "source": [ "### Important notes\n", @@ -378,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4826ecc9", + "id": "4828dbb8", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51e99525", + "id": "3f8f8d4d", "metadata": {}, "outputs": [], "source": [ @@ -433,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "870acf5a", + "id": "51c072be", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -441,7 +439,7 @@ }, { "cell_type": "markdown", - "id": "05a47e2c", + "id": "552df656", "metadata": {}, "source": [ "## a. Dataset Preparation" @@ -450,7 +448,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3d851795", + "id": "d697370f", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +474,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a430cd28", + "id": "545056bb", "metadata": {}, "outputs": [], "source": [ @@ -486,7 +484,7 @@ }, { "cell_type": "markdown", - "id": "84e79334", + "id": "91b319d2", "metadata": {}, "source": [ "## b. Training" @@ -495,7 +493,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92d9562f", + "id": "f9f8a7a7", "metadata": {}, "outputs": [], "source": [ @@ -523,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e82bc3a", + "id": "b9a0fbe1", "metadata": {}, "outputs": [], "source": [ @@ -535,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "e7da2068", + "id": "ce0d1d20", "metadata": {}, "source": [ "# 3. Inference" @@ -544,7 +542,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c71f05f", + "id": "7f4cca59", "metadata": {}, "outputs": [], "source": [ @@ -558,7 +556,7 @@ }, { "cell_type": "markdown", - "id": "d0575000", + "id": "b73658a1", "metadata": {}, "source": [ "## a. Load Model" @@ -567,7 +565,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b35cd5f9", + "id": "7d8e9d78", "metadata": {}, "outputs": [], "source": [ @@ -577,7 +575,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bb9f7f89", + "id": "a25cb651", "metadata": {}, "outputs": [], "source": [ @@ -588,7 +586,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8fd30da0", + "id": "6959280f", "metadata": {}, "outputs": [], "source": [ @@ -598,7 +596,7 @@ }, { "cell_type": "markdown", - "id": "af905e0c", + "id": "598cec4e", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -607,7 +605,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb19b665", + "id": "ef4e73c2", "metadata": {}, "outputs": [], "source": [ @@ -642,7 +640,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13b4d317", + "id": "ee925bc5", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +666,7 @@ { "cell_type": "code", "execution_count": null, - "id": "624419ed", + "id": "ed143375", "metadata": {}, "outputs": [], "source": [ @@ -703,7 +701,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dae80745", + "id": "0f40868c", "metadata": {}, "outputs": [], "source": [ From 1ae1250c2d7b5577b986ce158c453413dac3ab03 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Fri, 28 Apr 2023 14:44:44 -0700 Subject: [PATCH 16/25] Fix bug Signed-off-by: hsiehjackson --- scripts/dataset_processing/tts/extract_sup_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dataset_processing/tts/extract_sup_data.py b/scripts/dataset_processing/tts/extract_sup_data.py index b82d3182e3b2..0539ca783ff6 100644 --- a/scripts/dataset_processing/tts/extract_sup_data.py +++ b/scripts/dataset_processing/tts/extract_sup_data.py @@ -50,7 +50,7 @@ def preprocess_ds_for_mixer_tts_x(dataloader): CFG_NAME2FUNC = { "ds_for_fastpitch_align": preprocess_ds_for_fastpitch_align, - "ds_for_mixer_tts": preprocess_ds_for_mixer_tts, + "ds_for_mixer_tts": preprocess_ds_for_fastpitch_align, "ds_for_mixer_tts_x": preprocess_ds_for_mixer_tts_x, } From 0c2cbc0143ec287b20f21514caf84e12aa2d6675 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Fri, 28 Apr 2023 14:47:52 -0700 Subject: [PATCH 17/25] Fix bug Signed-off-by: hsiehjackson --- scripts/dataset_processing/tts/extract_sup_data.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/dataset_processing/tts/extract_sup_data.py b/scripts/dataset_processing/tts/extract_sup_data.py index 0539ca783ff6..bc592c7f51e5 100644 --- a/scripts/dataset_processing/tts/extract_sup_data.py +++ b/scripts/dataset_processing/tts/extract_sup_data.py @@ -31,7 +31,7 @@ def get_pitch_stats(pitch_list): def preprocess_ds_for_fastpitch_align(dataloader): pitch_list = [] for batch in tqdm(dataloader, total=len(dataloader)): - pitches = batch["pitch"] + audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices, pitches, pitches_lengths, *_ = batch pitch = pitches.squeeze(0) pitch_list.append(pitch[pitch != 0]) @@ -41,7 +41,16 @@ def preprocess_ds_for_fastpitch_align(dataloader): def preprocess_ds_for_mixer_tts_x(dataloader): pitch_list = [] for batch in tqdm(dataloader, total=len(dataloader)): - pitches = batch["pitch"] + ( + audios, + audio_lengths, + tokens, + tokens_lengths, + align_prior_matrices, + pitches, + pitches_lengths, + lm_tokens, + ) = batch pitch = pitches.squeeze(0) pitch_list.append(pitch[pitch != 0]) From e71387e1086a17fbef1489717431a16e3c398142 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Fri, 28 Apr 2023 14:49:10 -0700 Subject: [PATCH 18/25] Fix bug Signed-off-by: hsiehjackson --- scripts/dataset_processing/tts/extract_sup_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/dataset_processing/tts/extract_sup_data.py b/scripts/dataset_processing/tts/extract_sup_data.py index bc592c7f51e5..9a5dcc223444 100644 --- a/scripts/dataset_processing/tts/extract_sup_data.py +++ b/scripts/dataset_processing/tts/extract_sup_data.py @@ -51,6 +51,7 @@ def preprocess_ds_for_mixer_tts_x(dataloader): pitches_lengths, lm_tokens, ) = batch + pitch = pitches.squeeze(0) 
pitch_list.append(pitch[pitch != 0]) From 60555ab9e1afb0a4f31ebfde1e22ce45f503eb17 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 1 May 2023 19:11:15 -0700 Subject: [PATCH 19/25] Add precomputed speaker emb Signed-off-by: hsiehjackson --- .../conf/fastpitch_align_44100_adapter.yaml | 2 + examples/tts/fastpitch_finetune_adapters.py | 5 - nemo/collections/tts/modules/fastpitch.py | 18 +- nemo/collections/tts/modules/submodules.py | 21 +- .../tts/FastPitch_Adapter_Finetuning.ipynb | 484 ++++++++---------- .../FastPitch_MultiSpeaker_Pretraining.ipynb | 123 ++--- 6 files changed, 316 insertions(+), 337 deletions(-) diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml index 8a5ea17ef22d..85f740ab5773 100644 --- a/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ b/examples/tts/conf/fastpitch_align_44100_adapter.yaml @@ -248,6 +248,8 @@ model: speaker_encoder: _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder + precompute: false + precompute_embedding_dim: ${model.symbols_embedding_dim} lookup_module: _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable n_speakers: ??? diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py index b063a482f018..396552b0f4fd 100644 --- a/examples/tts/fastpitch_finetune_adapters.py +++ b/examples/tts/fastpitch_finetune_adapters.py @@ -103,11 +103,6 @@ def main(cfg): # Freeze model model.freeze() - # Used if we fine-tune with multi-speaker dataset - if model.fastpitch.speaker_encoder is not None and model.fastpitch.speaker_encoder.lookup_module is not None: - for name, param in model.fastpitch.speaker_encoder.lookup_module.named_parameters(): - param.requires_grad = True - # Setup adapters if adapter_global_cfg is not None: add_global_adapter_cfg(model, adapter_global_cfg) diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index e2da672cf9c7..42a70309c74b 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -177,7 +177,7 @@ def __init__( self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False - + # TODO: combine self.speaker_emb with self.speaker_encoder # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` @@ -244,10 +244,10 @@ def output_types(self): "energy_tgt": NeuralType(('B', 'T_audio'), RegressionValuesType()), } - def get_speaker_embedding(self, speaker, reference_spec, reference_spec_lens): + def get_speaker_embedding(self, batch_size, speaker, reference_spec, reference_spec_lens): """spk_emb: Bx1xD""" if self.speaker_encoder is not None: - spk_emb = self.speaker_encoder(speaker, reference_spec, reference_spec_lens).unsqueeze(1) + spk_emb = self.speaker_encoder(batch_size, speaker, reference_spec, reference_spec_lens).unsqueeze(1) elif self.speaker_emb is not None: if speaker is None: raise ValueError('Please give speaker id to get lookup speaker embedding.') @@ -281,7 +281,10 @@ def forward( # Calculate speaker embedding spk_emb = self.get_speaker_embedding( - speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, + batch_size=text.shape[0], + speaker=speaker, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_lens, ) # Input FFT @@ -379,10 +382,13 @@ def infer( reference_spec=None, 
reference_spec_lens=None, ): - + # Calculate speaker embedding spk_emb = self.get_speaker_embedding( - speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, + batch_size=text.shape[0], + speaker=speaker, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_lens, ) # Input FFT diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index dbf26f1ceeee..655a0a1f126c 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -709,18 +709,29 @@ class SpeakerEncoder represents speakers representation. This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings. """ - def __init__(self, lookup_module=None, gst_module=None): + def __init__(self, lookup_module=None, gst_module=None, precompute=False, precompute_embedding_dim=None): """ lookup_module: Torch module to get lookup based speaker embedding gst_module: Neural module to get GST based speaker embedding + precompute: Use precompute speaker embedding """ super(SpeakerEncoder, self).__init__() + + # Multi-speaker embedding self.lookup_module = lookup_module + + # Reference speaker embedding self.gst_module = gst_module - + + if precompute: + self.precomputed_emb = torch.nn.Parameter(torch.empty(precompute_embedding_dim)) + else: + self.register_parameter('precomputed_emb', None) + @property def input_types(self): return { + "batch_size": NeuralType(), "speaker": NeuralType(('B'), Index(), optional=True), "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), @@ -732,8 +743,12 @@ def output_types(self): "embs": NeuralType(('B', 'D'), EncodedRepresentation()), } - def forward(self, speaker=None, reference_spec=None, reference_spec_lens=None): + def forward(self, batch_size, speaker=None, reference_spec=None, reference_spec_lens=None): embs = None + + # Get Precomputed speaker embedding + if self.precomputed_emb is not None: + return self.precomputed_emb.unsqueeze(0).repeat(batch_size, 1) # Get Lookup table speaker embedding if self.lookup_module is not None and speaker is not None: diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 10f5481be95a..8b56b84bb007 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,28 +2,28 @@ "cells": [ { "cell_type": "markdown", - "id": "646925fb", + "id": "3be8f71b", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", "\n", "This notebook is designed to provide a guide on how to run FastPitch Adapter Finetuning Pipeline. It contains the following sections:\n", - "1. **Transform pre-trained FastPitch checkpoint to adapter-compatible checkpoint**\n", - "2. **Fine-tune FastPitch on adaptation data**: fine-tune pre-trained multi-speaker FastPitch for a new speaker\n", + "1. **Fine-tune FastPitch on adaptation data**: fine-tune pre-trained multi-speaker FastPitch for a new speaker\n", "* Dataset Preparation: download dataset and extract manifest files. 
(duration more than 15 mins)\n", "* Preprocessing: add absolute audio paths in manifest and extract Supplementary Data.\n", + "* **Model Setting: transform pre-trained checkpoint to adapter-compatible checkpoint and precompute speaker embedding**\n", "* Training: fine-tune frozen multispeaker FastPitch with trainable adapters.\n", - "3. **Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", + "2. **Fine-tune HiFiGAN on adaptation data**: fine-tune a vocoder for the fine-tuned multi-speaker FastPitch\n", "* Dataset Preparation: extract mel-spectrograms from fine-tuned FastPitch.\n", "* Training: fine-tune HiFiGAN with fine-tuned adaptation data.\n", - "4. **Inference**: generate speech from adpated FastPitch\n", + "3. **Inference**: generate speech from adpated FastPitch\n", "* Load Model: load pre-trained multi-speaker FastPitch with **fine-tuned adapters**.\n", "* Output Audio: generate audio files." ] }, { "cell_type": "markdown", - "id": "1cb5bf85", + "id": "afa6b88c", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64d0a1dc", + "id": "f5d0970b", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8ed7f938", + "id": "88d49e93", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f63ad0d", + "id": "9bdc327c", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bf9b2e6d", + "id": "de4b2c5f", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce31578e", + "id": "93a4b779", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8e5cbc4", + "id": "46d11d4d", "metadata": {}, "outputs": [], "source": [ @@ -149,83 +149,15 @@ }, { "cell_type": "markdown", - "id": "a198a651", + "id": "ac49fd21", "metadata": {}, "source": [ - "# 1. 
Transform pre-trained checkpoint to adapter-compatible checkpoint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bc36276", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.core import adapter_mixins\n", - "from omegaconf import DictConfig, OmegaConf, open_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3a85cb0", - "metadata": {}, - "outputs": [], - "source": [ - "def update_model_config_to_support_adapter(config) -> DictConfig:\n", - " with open_dict(config):\n", - " enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_)\n", - " if enc_adapter_metadata is not None:\n", - " config.input_fft._target_ = enc_adapter_metadata.adapter_class_path\n", - "\n", - " dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_)\n", - " if dec_adapter_metadata is not None:\n", - " config.output_fft._target_ = dec_adapter_metadata.adapter_class_path\n", - "\n", - " pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_)\n", - " if pitch_predictor_adapter_metadata is not None:\n", - " config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path\n", - "\n", - " duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_)\n", - " if duration_predictor_adapter_metadata is not None:\n", - " config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path\n", - "\n", - " aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_)\n", - " if aligner_adapter_metadata is not None:\n", - " config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path\n", - "\n", - " return config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8952410e", - "metadata": {}, - "outputs": [], - "source": [ - "model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", - "model.cfg = update_model_config_to_support_adapter(model.cfg)\n", - "model.save_to('Pretrained-FastPitch.nemo')\n", - "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"Pretrained-HifiGan.nemo\")\n", - "\n", - "pretrained_fastpitch_checkpoint = os.path.abspath(\"Pretrained-FastPitch.nemo\")\n", - "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"Pretrained-HifiGan.nemo\")" + "# 1. Fine-tune FastPitch on adaptation data" ] }, { "cell_type": "markdown", - "id": "19c33aab", - "metadata": {}, - "source": [ - "# 2. Fine-tune FastPitch on adaptation data" - ] - }, - { - "cell_type": "markdown", - "id": "d01c7d22", + "id": "3dbef87d", "metadata": {}, "source": [ "## a. Data Preparation\n", @@ -235,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33c526aa", + "id": "720793cf", "metadata": {}, "outputs": [], "source": [ @@ -245,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e00251f", + "id": "7b3d4cb0", "metadata": {}, "outputs": [], "source": [ @@ -256,7 +188,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4366aefc", + "id": "ad4c8431", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "50940938", + "id": "e664cb05", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -274,7 +206,7 @@ }, { "cell_type": "markdown", - "id": "6f51e8c3", + "id": "ccff9bf3", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -284,7 +216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c7fd91d", + "id": "f5f591f5", "metadata": {}, "outputs": [], "source": [ @@ -294,22 +226,22 @@ { "cell_type": "code", "execution_count": null, - "id": "45a97208", + "id": "7d7bc716", "metadata": {}, "outputs": [], "source": [ - "train_datas = read_manifest(train_manifest)\n", - "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "write_manifest(train_manifest, train_datas)\n", + "train_data = read_manifest(train_manifest)\n", + "for m in train_data: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "write_manifest(train_manifest, train_data)\n", "\n", - "valid_datas = read_manifest(valid_manifest)\n", - "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "write_manifest(valid_manifest, valid_datas)" + "valid_data = read_manifest(valid_manifest)\n", + "for m in valid_data: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "write_manifest(valid_manifest, valid_data)" ] }, { "cell_type": "markdown", - "id": "94ce06e4", + "id": "190640b1", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -320,7 +252,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47594938", + "id": "1fb41208", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +267,7 @@ }, { "cell_type": "markdown", - "id": "826bb1d1", + "id": "a1d5078b", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -348,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "900f7d42", + "id": "ad461dcc", "metadata": {}, "outputs": [], "source": [ @@ -363,16 +295,138 @@ }, { "cell_type": "markdown", - "id": "f03c022e", + "id": "06863743", + "metadata": {}, + "source": [ + "## c. 
Model Setting\n", + "### Transform pre-trained checkpoint to adapter-compatible checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09641544", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.tts.models import FastPitchModel\n", + "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", + "from nemo.core import adapter_mixins\n", + "from omegaconf import DictConfig, OmegaConf, open_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a5a8d8", + "metadata": {}, + "outputs": [], + "source": [ + "def update_model_config_to_support_adapter(config) -> DictConfig:\n", + " with open_dict(config):\n", + " enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_)\n", + " if enc_adapter_metadata is not None:\n", + " config.input_fft._target_ = enc_adapter_metadata.adapter_class_path\n", + "\n", + " dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_)\n", + " if dec_adapter_metadata is not None:\n", + " config.output_fft._target_ = dec_adapter_metadata.adapter_class_path\n", + "\n", + " pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_)\n", + " if pitch_predictor_adapter_metadata is not None:\n", + " config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_)\n", + " if duration_predictor_adapter_metadata is not None:\n", + " config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path\n", + "\n", + " aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_)\n", + " if aligner_adapter_metadata is not None:\n", + " config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path\n", + "\n", + " return config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "540bc3d0", + "metadata": {}, + "outputs": [], + "source": [ + "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint).eval().cuda()\n", + "spec_model.cfg = update_model_config_to_support_adapter(spec_model.cfg)" + ] + }, + { + "cell_type": "markdown", + "id": "28c76aca", "metadata": {}, "source": [ - "## c. 
Training" + "### Precompute Speaker Embedding\n", + "Get all GST speaker embeddings from training data, take average, and save as `precomputed_emb` in FastPitch" ] }, { "cell_type": "code", "execution_count": null, - "id": "8a5ad291", + "id": "a5fa3004", + "metadata": {}, + "outputs": [], + "source": [ + "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", + "train_data = read_manifest(train_manifest)\n", + "\n", + "spk_embs = [] \n", + "for data in train_data:\n", + " with torch.no_grad():\n", + " audio = wave_model.process(data['audio_filepath'])\n", + " audio_length = torch.tensor(audio.shape[0]).long()\n", + " audio = audio.unsqueeze(0).to(device=spec_model.device)\n", + " audio_length = audio_length.unsqueeze(0).to(device=spec_model.device)\n", + " spec_ref, spec_ref_lens = spec_model.preprocessor(input_signal=audio, length=audio_length)\n", + " spk_emb = spec_model.fastpitch.get_speaker_embedding(batch_size=spec_ref.shape[0],\n", + " speaker=None,\n", + " reference_spec=spec_ref,\n", + " reference_spec_lens=spec_ref_lens)\n", + "\n", + " spk_embs.append(spk_emb.squeeze().cpu())\n", + "\n", + "spk_embs = torch.stack(spk_embs, dim=0)\n", + "spk_emb = torch.mean(spk_embs, dim=0)\n", + "\n", + "with open_dict(spec_model.cfg):\n", + " spec_model.cfg.speaker_encoder.precompute = True\n", + " spec_model.cfg.speaker_encoder.precompute_embedding_dim = spec_model.cfg.symbols_embedding_dim\n", + "\n", + "spec_model.fastpitch.speaker_encoder.register_parameter('precomputed_emb', torch.nn.Parameter(spk_emb))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c623125", + "metadata": {}, + "outputs": [], + "source": [ + "spec_model.save_to('Pretrained-FastPitch.nemo')\n", + "shutil.copyfile(finetuned_hifigan_on_multispeaker_checkpoint, \"Pretrained-HifiGan.nemo\")\n", + "pretrained_fastpitch_checkpoint = os.path.abspath(\"Pretrained-FastPitch.nemo\")\n", + "finetuned_hifigan_on_multispeaker_checkpoint = os.path.abspath(\"Pretrained-HifiGan.nemo\")" + ] + }, + { + "cell_type": "markdown", + "id": "2fd6062b", + "metadata": {}, + "source": [ + "## d. 
Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aef915a4", "metadata": {}, "outputs": [], "source": [ @@ -386,12 +440,13 @@ }, { "cell_type": "markdown", - "id": "0b9e8f8e", + "id": "1a5496c4", "metadata": {}, "source": [ "### Important notes\n", - "* `+init_from_ptl_ckpt`: initialize with a multi-speaker FastPitch checkpoint\n", - "* `~model.speaker_encoder.lookup_module`: remove the pre-trained looked-up speaker embedding\n", + "* `+init_from_nemo_model`: initialize with a multi-speaker FastPitch checkpoint\n", + "* `~model.speaker_encoder.lookup_module`: we use precomputed speaker embedding, so we remove the pre-trained looked-up speaker embedding\n", + "* `~model.speaker_encoder.gst_module`: we use precomputed speaker embedding, so we remove the pre-trained gst speaker embedding\n", "* Other optional arguments based on your preference:\n", " * batch_size\n", " * exp_manager\n", @@ -401,7 +456,7 @@ { "cell_type": "code", "execution_count": null, - "id": "24fd1e9a", + "id": "87434f1f", "metadata": {}, "outputs": [], "source": [ @@ -411,11 +466,13 @@ "+init_from_nemo_model={pretrained_fastpitch_checkpoint} \\\n", "train_dataset={train_manifest} \\\n", "validation_datasets={valid_manifest} \\\n", - "sup_data_types=\"['align_prior_matrix', 'pitch', 'speaker_id', 'reference_audio']\" \\\n", + "sup_data_types=\"['align_prior_matrix', 'pitch']\" \\\n", "sup_data_path={supp_dir} \\\n", "pitch_mean={PITCH_MEAN} \\\n", "pitch_std={PITCH_STD} \\\n", + "model.speaker_encoder.precompute=True \\\n", "~model.speaker_encoder.lookup_module \\\n", + "~model.speaker_encoder.gst_module \\\n", "model.train_ds.dataloader_params.batch_size=8 \\\n", "model.validation_ds.dataloader_params.batch_size=8 \\\n", "model.optim.name=adam \\\n", @@ -437,19 +494,22 @@ { "cell_type": "code", "execution_count": null, - "id": "22936e1c", + "id": "700c55c1", "metadata": {}, "outputs": [], "source": [ + "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/FastPitch.nemo\n", "# e.g. NeMoTTS_logs/FastPitch/Y-M-D_H-M-S/checkpoints/adapters.pt\n", "last_checkpoint_dir = sorted(list([i for i in (Path(logs_dir) / \"FastPitch\").iterdir() if i.is_dir()]))[-1] / \"checkpoints\"\n", + "finetuned_fastpitch_checkpoint = list(last_checkpoint_dir.glob('*.nemo'))[0]\n", "finetuned_adapter_checkpoint = list(last_checkpoint_dir.glob('adapters.pt'))[0]\n", + "print(finetuned_fastpitch_checkpoint)\n", "print(finetuned_adapter_checkpoint)" ] }, { "cell_type": "markdown", - "id": "46ff1669", + "id": "42096605", "metadata": {}, "source": [ "# 3. Fine-tune HiFiGAN on adaptation data" @@ -457,142 +517,53 @@ }, { "cell_type": "markdown", - "id": "4b16bde0", + "id": "2e85a5cc", "metadata": {}, "source": [ "## a. Dataset Preparation\n", - "Generate mel-spectrograms for HiFiGAN training.\n", - "### Important notes\n", - "* We don't use existing scripts (e.g. `resynthesize_dataset.py` or `generate_mels.py`) \n", - "* We may need `reference spectrogram` or `speaker` as the input of FastPitch.\n", - "* We call `load_adapters` to resume our FastPitch" + "Generate mel-spectrograms for HiFiGAN training." 
] }, { "cell_type": "code", "execution_count": null, - "id": "6f4ed978", + "id": "7dc49387", "metadata": {}, "outputs": [], "source": [ - "from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator\n", - "from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer\n", - "from collections import defaultdict\n", - "import random\n", - "random.seed(100)" + "!cd {code_dir} \\\n", + "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", + "--model-path={finetuned_fastpitch_checkpoint} \\\n", + "--input-json-manifest={train_manifest} \\\n", + "--input-sup-data-path={supp_dir} \\\n", + "--output-folder={mels_dir} \\\n", + "--device=\"cuda:0\" \\\n", + "--batch-size=1 \\\n", + "--num-workers=1 \\\n", + "&& python scripts/dataset_processing/tts/resynthesize_dataset.py \\\n", + "--model-path={finetuned_fastpitch_checkpoint} \\\n", + "--input-json-manifest={valid_manifest} \\\n", + "--input-sup-data-path={supp_dir} \\\n", + "--output-folder={mels_dir} \\\n", + "--device=\"cuda:0\" \\\n", + "--batch-size=1 \\\n", + "--num-workers=1" ] }, { "cell_type": "code", "execution_count": null, - "id": "9b21631f", + "id": "55cf4fa3", "metadata": {}, "outputs": [], "source": [ - "def gen_spectrogram(index, manifest, speaker_to_index):\n", - " \n", - " record = manifest[index]\n", - " audio_file = record[\"audio_filepath\"]\n", - " \n", - " if '.wav' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(mels_dir, audio_file.split(\"/\")[-1].replace(\".wav\", \".npy\")))\n", - " \n", - " if '.flac' in audio_file:\n", - " save_path = os.path.abspath(os.path.join(mels_dir, audio_file.split(\"/\")[-1].replace(\".flac\", \".npy\")))\n", - " \n", - " if os.path.exists(save_path):\n", - " return save_path\n", - " \n", - " if \"normalized_text\" in record:\n", - " text = spec_model.parse(record[\"normalized_text\"], normalize=False)\n", - " else:\n", - " text = spec_model.parse(record['text'])\n", - " \n", - " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=spec_model.device).unsqueeze(0)\n", - " \n", - " audio = wave_model.process(audio_file).unsqueeze(0).to(device=spec_model.device)\n", - " audio_len = torch.tensor(audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", - " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len) \n", - " \n", - " attn_prior = torch.from_numpy(beta_binomial_interpolator(spect_len.item(), text_len.item())).unsqueeze(0).to(spec_model.device)\n", - " \n", - " reference_pool = speaker_to_index[record[\"speaker\"]] - set([index]) if len(speaker_to_index[record[\"speaker\"]]) > 1 else speaker_to_index[record[\"speaker\"]]\n", - " reference_sample = manifest[random.sample(reference_pool, 1)[0]]\n", - " reference_audio = wave_model.process(reference_sample[\"audio_filepath\"]).unsqueeze(0).to(device=spec_model.device)\n", - " reference_audio_length = torch.tensor(reference_audio.shape[1]).long().unsqueeze(0).to(device=spec_model.device)\n", - " reference_spec, reference_spec_len = spec_model.preprocessor(input_signal=reference_audio, length=reference_audio_length) \n", - " \n", - " with torch.no_grad():\n", - " spectrogram = spec_model.forward(\n", - " text=text, \n", - " input_lens=text_len,\n", - " spec=spect, \n", - " mel_lens=spect_len, \n", - " attn_prior=attn_prior,\n", - " reference_spec=reference_spec,\n", - " reference_spec_lens=reference_spec_len,\n", - " )[0]\n", - " \n", - " spec = spectrogram[0].to('cpu').numpy()\n", - " np.save(save_path, spec)\n", - 
" return save_path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eeefd908", - "metadata": {}, - "outputs": [], - "source": [ - "wave_model = WaveformFeaturizer(sample_rate=sample_rate)\n", - "\n", - "# Pretrained FastPitch Weights\n", - "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", - "\n", - "# Load Adapter Weights\n", - "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", - "spec_model.eval().cuda()\n", - "\n", - "beta_binomial_interpolator = BetaBinomialInterpolator()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83b1705c", - "metadata": {}, - "outputs": [], - "source": [ - "os.makedirs(mels_dir, exist_ok=True)\n", - "\n", - "# Train\n", - "train_datas = read_manifest(train_manifest)\n", - "speaker_to_index = defaultdict(list)\n", - "for i, d in enumerate(train_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", - "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", - "\n", - "for i, record in enumerate(tqdm(train_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, train_datas, speaker_to_index)\n", - "\n", - "write_manifest(train_manifest, train_datas)\n", - "\n", - "\n", - "# Valid\n", - "valid_datas = read_manifest(valid_manifest)\n", - "speaker_to_index = defaultdict(list)\n", - "for i, d in enumerate(valid_datas): speaker_to_index[d.get('speaker', None)].append(i)\n", - "speaker_to_index = {k: set(v) for k, v in speaker_to_index.items()}\n", - "\n", - "for i, record in enumerate(tqdm(valid_datas)):\n", - " record[\"mel_filepath\"] = gen_spectrogram(i, valid_datas, speaker_to_index)\n", - "\n", - "write_manifest(valid_manifest, valid_datas)" + "train_manifest_mel = f\"{mels_dir}/train_mel.json\"\n", + "valid_manifest_mel = f\"{mels_dir}/dev_mel.json\"" ] }, { "cell_type": "markdown", - "id": "9044e5d8", + "id": "742395cb", "metadata": {}, "source": [ "## b. Training" @@ -601,15 +572,15 @@ { "cell_type": "code", "execution_count": null, - "id": "f72d469b", + "id": "d570edb8", "metadata": {}, "outputs": [], "source": [ "# Normally 500 epochs\n", "!cd {code_dir} && python examples/tts/hifigan_finetune.py \\\n", "--config-name=hifigan_44100.yaml \\\n", - "train_dataset={train_manifest} \\\n", - "validation_datasets={valid_manifest} \\\n", + "train_dataset={train_manifest_mel} \\\n", + "validation_datasets={valid_manifest_mel} \\\n", "+init_from_nemo_model={finetuned_hifigan_on_multispeaker_checkpoint} \\\n", "model.train_ds.dataloader_params.batch_size=32 \\\n", "model.optim.lr=0.0001 \\\n", @@ -629,7 +600,7 @@ { "cell_type": "code", "execution_count": null, - "id": "465fe036", + "id": "0df81764", "metadata": {}, "outputs": [], "source": [ @@ -641,7 +612,7 @@ }, { "cell_type": "markdown", - "id": "54f7ae0b", + "id": "93378ae9", "metadata": {}, "source": [ "# 4. Inference" @@ -650,7 +621,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4edc2f56", + "id": "8f287c7a", "metadata": {}, "outputs": [], "source": [ @@ -661,7 +632,7 @@ }, { "cell_type": "markdown", - "id": "92a88dab", + "id": "13139310", "metadata": {}, "source": [ "## a. 
Load Model" @@ -670,7 +641,7 @@ { "cell_type": "code", "execution_count": null, - "id": "460ad165", + "id": "729846cd", "metadata": {}, "outputs": [], "source": [ @@ -680,20 +651,23 @@ { "cell_type": "code", "execution_count": null, - "id": "4b4f8c61", + "id": "9e4b03a3", "metadata": {}, "outputs": [], "source": [ - "# FastPitch\n", - "spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", - "spec_model.load_adapters(finetuned_adapter_checkpoint)\n", + "# Load from pretrained FastPitch and finetuned adapter\n", + "# spec_model = FastPitchModel.restore_from(pretrained_fastpitch_checkpoint)\n", + "# spec_model.load_adapters(finetuned_adapter_checkpoint)\n", + "\n", + "# Load from finetuned FastPitch\n", + "spec_model = FastPitchModel.restore_from(finetuned_fastpitch_checkpoint)\n", "spec_model = spec_model.eval().cuda()" ] }, { "cell_type": "code", "execution_count": null, - "id": "568a5e3c", + "id": "5adbcd4e", "metadata": {}, "outputs": [], "source": [ @@ -703,7 +677,7 @@ }, { "cell_type": "markdown", - "id": "ded03c63", + "id": "75396933", "metadata": {}, "source": [ "## b. Output Audio" @@ -712,7 +686,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2c06bd4", + "id": "8af85268", "metadata": {}, "outputs": [], "source": [ @@ -745,7 +719,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b876880", + "id": "39b49c4a", "metadata": {}, "outputs": [], "source": [ @@ -768,7 +742,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59a4a4bb", + "id": "3fef5770", "metadata": {}, "outputs": [], "source": [ @@ -800,43 +774,29 @@ { "cell_type": "code", "execution_count": null, - "id": "db0c6830", - "metadata": {}, - "outputs": [], - "source": [ - "fintuned_fastpitch = 'fastpitch.nemo'\n", - "fintuned_hifigan = 'hifigan.nemo'\n", - "spec_model.save_to(fintuned_fastpitch)\n", - "vocoder_model.save_to(fintuned_hifigan)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d022577", + "id": "1f391606", "metadata": {}, "outputs": [], "source": [ - "print(f\"FastPitch checkpoint: {pretrained_fastpitch_checkpoint}\")\n", - "print(f\"Adapter checkpoint: {finetuned_adapter_checkpoint}\")\n", - "print(f\"HiFi-Gan checkpoint: {finetuned_hifigan_on_adaptation_checkpoint}\")" + "print(f\"Pretraind FastPitch: {pretrained_fastpitch_checkpoint}\")\n", + "print(f\"Finetuned Adapter: {finetuned_adapter_checkpoint}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "2e8b30f5", + "id": "9ca0b7ee", "metadata": {}, "outputs": [], "source": [ - "print(f\"FastPitch nemo file: {fintuned_fastpitch}\")\n", - "print(f\"HiFi-Gan nemo file: {fintuned_hifigan}\")" + "print(f\"Finetuned FastPitch: {finetuned_fastpitch_checkpoint}\")\n", + "print(f\"Finetuned HiFi-Gan: {finetuned_hifigan_on_adaptation_checkpoint}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "7f61605c", + "id": "3b4a8f96", "metadata": {}, "outputs": [], "source": [] @@ -858,7 +818,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index e0edbde2f845..defd0272d89d 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3dd6dd51", + "id": "afd8cdc9", "metadata": {}, "source": [ "# FastPitch MultiSpeaker 
Pretraining\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "915d3200", + "id": "4fc9c6b9", "metadata": {}, "source": [ "# License\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0bd81d8", + "id": "b81f6c14", "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cf04110", + "id": "f2f1e3ac", "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dc30b714", + "id": "1acd141d", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a582efd1", + "id": "7b54c45e", "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e1c5e7a0", + "id": "a119994b", "metadata": {}, "outputs": [], "source": [ @@ -134,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "52dcee31", + "id": "dbb3ac0e", "metadata": {}, "source": [ "# 1. Pre-train FastPitch on multi-speaker data" @@ -142,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "068a6ff0", + "id": "095a1fca", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -153,7 +153,7 @@ { "cell_type": "code", "execution_count": null, - "id": "155329c5", + "id": "69b17b07", "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de97ae34", + "id": "a65e7938", "metadata": {}, "outputs": [], "source": [ @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "571eb693", + "id": "08b27b92", "metadata": {}, "outputs": [], "source": [ @@ -184,7 +184,7 @@ }, { "cell_type": "markdown", - "id": "86a56ab6", + "id": "7cbf24d6", "metadata": {}, "source": [ "## b. 
Preprocessing" @@ -192,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "4f52f928", + "id": "cae8567d", "metadata": {}, "source": [ "### Add absoluate audio path in manifest\n", @@ -202,7 +202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "756e4415", + "id": "71d2fe63", "metadata": {}, "outputs": [], "source": [ @@ -212,22 +212,22 @@ { "cell_type": "code", "execution_count": null, - "id": "86112ae9", + "id": "dc51398c", "metadata": {}, "outputs": [], "source": [ - "train_datas = read_manifest(train_manifest)\n", - "for m in train_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "write_manifest(train_manifest, train_datas)\n", + "train_data = read_manifest(train_manifest)\n", + "for m in train_data: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "write_manifest(train_manifest, train_data)\n", "\n", - "valid_datas = read_manifest(valid_manifest)\n", - "for m in valid_datas: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", - "write_manifest(valid_manifest, valid_datas)" + "valid_data = read_manifest(valid_manifest)\n", + "for m in valid_data: m['audio_filepath'] = os.path.abspath(os.path.join(manidir, m['audio_filepath']))\n", + "write_manifest(valid_manifest, valid_data)" ] }, { "cell_type": "markdown", - "id": "0978c35d", + "id": "678bb37c", "metadata": {}, "source": [ "### Calibrate speaker id to start from 0\n", @@ -237,23 +237,23 @@ { "cell_type": "code", "execution_count": null, - "id": "8bba736b", + "id": "594c6f2d", "metadata": {}, "outputs": [], "source": [ - "train_datas = read_manifest(train_manifest)\n", - "speaker2id = {s: _id for _id, s in enumerate(set([m['speaker'] for m in train_datas]))}\n", - "for m in train_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", - "write_manifest(train_manifest, train_datas)\n", + "train_data = read_manifest(train_manifest)\n", + "speaker2id = {s: _id for _id, s in enumerate(set([m['speaker'] for m in train_data]))}\n", + "for m in train_data: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "write_manifest(train_manifest, train_data)\n", "\n", - "valid_datas = read_manifest(valid_manifest)\n", - "for m in valid_datas: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", - "write_manifest(valid_manifest, valid_datas)" + "valid_data = read_manifest(valid_manifest)\n", + "for m in valid_data: m['old_speaker'], m['speaker'] = m['speaker'], speaker2id[m['speaker']]\n", + "write_manifest(valid_manifest, valid_data)" ] }, { "cell_type": "markdown", - "id": "48f9c4ab", + "id": "15b6cc65", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -264,7 +264,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c30d68d1", + "id": "c3728ac9", "metadata": {}, "outputs": [], "source": [ @@ -279,7 +279,7 @@ }, { "cell_type": "markdown", - "id": "6f42b01a", + "id": "effd9182", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. 
We will be there in the following section.\n", @@ -292,7 +292,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee5193bf", + "id": "37e54cd4", "metadata": {}, "outputs": [], "source": [ @@ -307,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "f55e2605", + "id": "82d2c99d", "metadata": {}, "source": [ "* If you want to compute pitch mean and std for each speaker, you can use the script `compute_speaker_stats.py`\n", @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "04ea7c7a", + "id": "a7c8dfb6", "metadata": {}, "source": [ "## c. Training" @@ -330,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "254ed2e1", + "id": "e378a792", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "9b38817f", + "id": "a90ddfb3", "metadata": {}, "source": [ "### Important notes\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4828dbb8", + "id": "ac22f3a8", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f8f8d4d", + "id": "b6fc98a5", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "51c072be", + "id": "b175f755", "metadata": {}, "source": [ "# 2. Fine-tune HiFiGAN on multi-speaker data" @@ -439,16 +439,17 @@ }, { "cell_type": "markdown", - "id": "552df656", + "id": "5749a0b8", "metadata": {}, "source": [ - "## a. Dataset Preparation" + "## a. Dataset Preparation\n", + "Generate mel-spectrograms for HiFiGAN training." ] }, { "cell_type": "code", "execution_count": null, - "id": "d697370f", + "id": "3d77bda9", "metadata": {}, "outputs": [], "source": [ @@ -474,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "545056bb", + "id": "8c9159a1", "metadata": {}, "outputs": [], "source": [ @@ -484,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "91b319d2", + "id": "24653f24", "metadata": {}, "source": [ "## b. Training" @@ -493,7 +494,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9f8a7a7", + "id": "fadc0410", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +522,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b9a0fbe1", + "id": "864fe5ba", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +534,7 @@ }, { "cell_type": "markdown", - "id": "ce0d1d20", + "id": "e04540b6", "metadata": {}, "source": [ "# 3. Inference" @@ -542,7 +543,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f4cca59", + "id": "fdf662f7", "metadata": {}, "outputs": [], "source": [ @@ -556,7 +557,7 @@ }, { "cell_type": "markdown", - "id": "b73658a1", + "id": "270a3264", "metadata": {}, "source": [ "## a. Load Model" @@ -565,7 +566,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d8e9d78", + "id": "01315a66", "metadata": {}, "outputs": [], "source": [ @@ -575,7 +576,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a25cb651", + "id": "536c8fdc", "metadata": {}, "outputs": [], "source": [ @@ -586,7 +587,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6959280f", + "id": "a2ace7c4", "metadata": {}, "outputs": [], "source": [ @@ -596,7 +597,7 @@ }, { "cell_type": "markdown", - "id": "598cec4e", + "id": "cf4a42fa", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -605,7 +606,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef4e73c2", + "id": "1b376468", "metadata": {}, "outputs": [], "source": [ @@ -640,7 +641,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee925bc5", + "id": "f93f73a6", "metadata": {}, "outputs": [], "source": [ @@ -666,7 +667,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed143375", + "id": "77590752", "metadata": {}, "outputs": [], "source": [ @@ -701,7 +702,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f40868c", + "id": "8cd156e4", "metadata": {}, "outputs": [], "source": [ @@ -726,7 +727,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.10" } }, "nbformat": 4, From e9ba7f6f2f30865c0146cf4aecb9b414cfb7140e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 May 2023 02:12:29 +0000 Subject: [PATCH 20/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/tts/modules/fastpitch.py | 12 ++++++------ nemo/collections/tts/modules/submodules.py | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index 42a70309c74b..10109fea1000 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -177,7 +177,7 @@ def __init__( self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False - + # TODO: combine self.speaker_emb with self.speaker_encoder # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` @@ -282,8 +282,8 @@ def forward( # Calculate speaker embedding spk_emb = self.get_speaker_embedding( batch_size=text.shape[0], - speaker=speaker, - reference_spec=reference_spec, + speaker=speaker, + reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, ) @@ -382,12 +382,12 @@ def infer( reference_spec=None, reference_spec_lens=None, ): - + # Calculate speaker embedding spk_emb = self.get_speaker_embedding( batch_size=text.shape[0], - speaker=speaker, - reference_spec=reference_spec, + speaker=speaker, + reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, ) diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 655a0a1f126c..acea746d4231 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -716,18 +716,18 @@ def __init__(self, lookup_module=None, gst_module=None, precompute=False, precom precompute: Use precompute speaker embedding """ super(SpeakerEncoder, self).__init__() - + # Multi-speaker embedding self.lookup_module = lookup_module - + # Reference speaker embedding self.gst_module = gst_module - + if precompute: self.precomputed_emb = torch.nn.Parameter(torch.empty(precompute_embedding_dim)) else: self.register_parameter('precomputed_emb', None) - + @property def input_types(self): return { @@ -745,7 +745,7 @@ def output_types(self): def forward(self, batch_size, speaker=None, reference_spec=None, reference_spec_lens=None): embs = None - + # Get Precomputed speaker embedding if self.precomputed_emb is not None: return self.precomputed_emb.unsqueeze(0).repeat(batch_size, 1) From 4795fe15c59a87012b808024a29282b6abef8011 
Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Mon, 1 May 2023 19:13:44 -0700 Subject: [PATCH 21/25] Fix space Signed-off-by: hsiehjackson --- nemo/collections/tts/modules/fastpitch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index 42a70309c74b..5eb304b669cf 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -177,7 +177,6 @@ def __init__( self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False - # TODO: combine self.speaker_emb with self.speaker_encoder # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` @@ -382,7 +381,6 @@ def infer( reference_spec=None, reference_spec_lens=None, ): - # Calculate speaker embedding spk_emb = self.get_speaker_embedding( batch_size=text.shape[0], From 4b375d9b996bfeadacc0a7b7e88e2493953f0d4a Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 2 May 2023 11:56:30 -0700 Subject: [PATCH 22/25] Remove repeated argument Signed-off-by: hsiehjackson --- .../conf/fastpitch_align_44100_adapter.yaml | 3 +- nemo/collections/tts/modules/submodules.py | 13 +- .../tts/FastPitch_Adapter_Finetuning.ipynb | 115 +++++++++--------- 3 files changed, 67 insertions(+), 64 deletions(-) diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml index 85f740ab5773..b2957b057d28 100644 --- a/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ b/examples/tts/conf/fastpitch_align_44100_adapter.yaml @@ -248,8 +248,7 @@ model: speaker_encoder: _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder - precompute: false - precompute_embedding_dim: ${model.symbols_embedding_dim} + precomputed_embedding_dim: null lookup_module: _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable n_speakers: ??? diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index acea746d4231..72d853f3d3e7 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -709,11 +709,11 @@ class SpeakerEncoder represents speakers representation. This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings. 
""" - def __init__(self, lookup_module=None, gst_module=None, precompute=False, precompute_embedding_dim=None): + def __init__(self, lookup_module=None, gst_module=None, precomputed_embedding_dim=None): """ lookup_module: Torch module to get lookup based speaker embedding gst_module: Neural module to get GST based speaker embedding - precompute: Use precompute speaker embedding + precomputed_embedding_dim: Give precomputed speaker embedding dimension to use precompute speaker embedding """ super(SpeakerEncoder, self).__init__() @@ -723,10 +723,10 @@ def __init__(self, lookup_module=None, gst_module=None, precompute=False, precom # Reference speaker embedding self.gst_module = gst_module - if precompute: - self.precomputed_emb = torch.nn.Parameter(torch.empty(precompute_embedding_dim)) + if precomputed_embedding_dim is not None: + self.precomputed_emb = torch.nn.Parameter(torch.empty(precomputed_embedding_dim)) else: - self.register_parameter('precomputed_emb', None) + self.precomputed_emb = None @property def input_types(self): @@ -742,6 +742,9 @@ def output_types(self): return { "embs": NeuralType(('B', 'D'), EncodedRepresentation()), } + + def overwrite_precomputed_emb(self, emb): + self.precomputed_emb = torch.nn.Parameter(emb) def forward(self, batch_size, speaker=None, reference_spec=None, reference_spec_lens=None): embs = None diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 8b56b84bb007..186b241d30ef 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3be8f71b", + "id": "4d5e96d5", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "afa6b88c", + "id": "f71417c8", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5d0970b", + "id": "5ed77850", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88d49e93", + "id": "30c00546", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9bdc327c", + "id": "3a2eaa27", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de4b2c5f", + "id": "ca2f7996", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93a4b779", + "id": "2c6a587e", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46d11d4d", + "id": "397f4cf8", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "ac49fd21", + "id": "467fa160", "metadata": {}, "source": [ "# 1. Fine-tune FastPitch on adaptation data" @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "3dbef87d", + "id": "e1ed84a4", "metadata": {}, "source": [ "## a. 
Data Preparation\n", @@ -167,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "720793cf", + "id": "40a81e28", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b3d4cb0", + "id": "35bd0031", "metadata": {}, "outputs": [], "source": [ @@ -188,7 +188,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad4c8431", + "id": "0f0446cc", "metadata": {}, "outputs": [], "source": [ @@ -198,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "e664cb05", + "id": "5da5540d", "metadata": {}, "source": [ "## b. Preprocessing" @@ -206,7 +206,7 @@ }, { "cell_type": "markdown", - "id": "ccff9bf3", + "id": "6ed76ea3", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -216,7 +216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5f591f5", + "id": "c4cfa38c", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d7bc716", + "id": "37449a6d", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "190640b1", + "id": "ea8faef9", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -252,7 +252,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1fb41208", + "id": "59c52fe5", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +267,7 @@ }, { "cell_type": "markdown", - "id": "a1d5078b", + "id": "39e2ed47", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad461dcc", + "id": "7e1a8f04", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "06863743", + "id": "a4225952", "metadata": {}, "source": [ "## c. 
Model Setting\n", @@ -305,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "09641544", + "id": "03c74454", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6a5a8d8", + "id": "4ee4d0ad", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +350,7 @@ { "cell_type": "code", "execution_count": null, - "id": "540bc3d0", + "id": "0dbe5059", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "28c76aca", + "id": "58a28965", "metadata": {}, "source": [ "### Precompute Speaker Embedding\n", @@ -370,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5fa3004", + "id": "ed1c4f4a", "metadata": {}, "outputs": [], "source": [ @@ -394,18 +394,18 @@ "\n", "spk_embs = torch.stack(spk_embs, dim=0)\n", "spk_emb = torch.mean(spk_embs, dim=0)\n", + "spk_emb_dim = spk_emb.shape[0]\n", "\n", "with open_dict(spec_model.cfg):\n", - " spec_model.cfg.speaker_encoder.precompute = True\n", - " spec_model.cfg.speaker_encoder.precompute_embedding_dim = spec_model.cfg.symbols_embedding_dim\n", + " spec_model.cfg.speaker_encoder.precomputed_embedding_dim = spec_model.cfg.symbols_embedding_dim\n", "\n", - "spec_model.fastpitch.speaker_encoder.register_parameter('precomputed_emb', torch.nn.Parameter(spk_emb))" + "spec_model.fastpitch.speaker_encoder.overwrite_precomputed_emb(spk_emb)" ] }, { "cell_type": "code", "execution_count": null, - "id": "6c623125", + "id": "0684d0d8", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +417,7 @@ }, { "cell_type": "markdown", - "id": "2fd6062b", + "id": "f16a9427", "metadata": {}, "source": [ "## d. Training" @@ -426,7 +426,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aef915a4", + "id": "e8b0492c", "metadata": {}, "outputs": [], "source": [ @@ -440,11 +440,12 @@ }, { "cell_type": "markdown", - "id": "1a5496c4", + "id": "06905ed2", "metadata": {}, "source": [ "### Important notes\n", "* `+init_from_nemo_model`: initialize with a multi-speaker FastPitch checkpoint\n", + "* `model.speaker_encoder.precompute=True`: use precomputed speaker embedding\n", "* `~model.speaker_encoder.lookup_module`: we use precomputed speaker embedding, so we remove the pre-trained looked-up speaker embedding\n", "* `~model.speaker_encoder.gst_module`: we use precomputed speaker embedding, so we remove the pre-trained gst speaker embedding\n", "* Other optional arguments based on your preference:\n", @@ -456,7 +457,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87434f1f", + "id": "b46d962c", "metadata": {}, "outputs": [], "source": [ @@ -470,7 +471,7 @@ "sup_data_path={supp_dir} \\\n", "pitch_mean={PITCH_MEAN} \\\n", "pitch_std={PITCH_STD} \\\n", - "model.speaker_encoder.precompute=True \\\n", + "model.speaker_encoder.precomputed_embedding_dim={spk_emb_dim} \\\n", "~model.speaker_encoder.lookup_module \\\n", "~model.speaker_encoder.gst_module \\\n", "model.train_ds.dataloader_params.batch_size=8 \\\n", @@ -494,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "700c55c1", + "id": "058cca91", "metadata": {}, "outputs": [], "source": [ @@ -509,7 +510,7 @@ }, { "cell_type": "markdown", - "id": "42096605", + "id": "2a3a149a", "metadata": {}, "source": [ "# 3. Fine-tune HiFiGAN on adaptation data" @@ -517,7 +518,7 @@ }, { "cell_type": "markdown", - "id": "2e85a5cc", + "id": "b08eb8a1", "metadata": {}, "source": [ "## a. 
Dataset Preparation\n", @@ -527,7 +528,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7dc49387", + "id": "cc4b912b", "metadata": {}, "outputs": [], "source": [ @@ -553,7 +554,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55cf4fa3", + "id": "55cac67f", "metadata": {}, "outputs": [], "source": [ @@ -563,7 +564,7 @@ }, { "cell_type": "markdown", - "id": "742395cb", + "id": "5c1d0af1", "metadata": {}, "source": [ "## b. Training" @@ -572,7 +573,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d570edb8", + "id": "47b293c0", "metadata": {}, "outputs": [], "source": [ @@ -600,7 +601,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0df81764", + "id": "dbe10ad9", "metadata": {}, "outputs": [], "source": [ @@ -612,7 +613,7 @@ }, { "cell_type": "markdown", - "id": "93378ae9", + "id": "ac27f618", "metadata": {}, "source": [ "# 4. Inference" @@ -621,7 +622,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f287c7a", + "id": "1f468c2d", "metadata": {}, "outputs": [], "source": [ @@ -632,7 +633,7 @@ }, { "cell_type": "markdown", - "id": "13139310", + "id": "67f7de09", "metadata": {}, "source": [ "## a. Load Model" @@ -641,7 +642,7 @@ { "cell_type": "code", "execution_count": null, - "id": "729846cd", + "id": "009e2463", "metadata": {}, "outputs": [], "source": [ @@ -651,7 +652,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e4b03a3", + "id": "be639296", "metadata": {}, "outputs": [], "source": [ @@ -667,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5adbcd4e", + "id": "70001e21", "metadata": {}, "outputs": [], "source": [ @@ -677,7 +678,7 @@ }, { "cell_type": "markdown", - "id": "75396933", + "id": "95e549ee", "metadata": {}, "source": [ "## b. Output Audio" @@ -686,7 +687,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8af85268", + "id": "c8395276", "metadata": {}, "outputs": [], "source": [ @@ -719,7 +720,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39b49c4a", + "id": "40d2bab5", "metadata": {}, "outputs": [], "source": [ @@ -742,7 +743,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3fef5770", + "id": "4a14cb9b", "metadata": {}, "outputs": [], "source": [ @@ -774,7 +775,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f391606", + "id": "32d84ee3", "metadata": {}, "outputs": [], "source": [ @@ -785,7 +786,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ca0b7ee", + "id": "4ee3dfeb", "metadata": {}, "outputs": [], "source": [ @@ -796,7 +797,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3b4a8f96", + "id": "e300f066", "metadata": {}, "outputs": [], "source": [] From 908aa67db3434f0dcedceb9625fb34899a91888d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 May 2023 19:04:55 +0000 Subject: [PATCH 23/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/tts/modules/submodules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 72d853f3d3e7..eb44fb25fcf6 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -742,7 +742,7 @@ def output_types(self): return { "embs": NeuralType(('B', 'D'), EncodedRepresentation()), } - + def overwrite_precomputed_emb(self, emb): self.precomputed_emb = torch.nn.Parameter(emb) From 
4e75a6e2fda1b24d9d8dbaf9d0da73bcfd9484b0 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 2 May 2023 14:32:17 -0700 Subject: [PATCH 24/25] optional batch size Signed-off-by: hsiehjackson --- nemo/collections/tts/modules/submodules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 72d853f3d3e7..b5277076ded7 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -731,7 +731,7 @@ def __init__(self, lookup_module=None, gst_module=None, precomputed_embedding_di @property def input_types(self): return { - "batch_size": NeuralType(), + "batch_size": NeuralType(optional=True), "speaker": NeuralType(('B'), Index(), optional=True), "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), @@ -746,7 +746,7 @@ def output_types(self): def overwrite_precomputed_emb(self, emb): self.precomputed_emb = torch.nn.Parameter(emb) - def forward(self, batch_size, speaker=None, reference_spec=None, reference_spec_lens=None): + def forward(self, batch_size=None, speaker=None, reference_spec=None, reference_spec_lens=None): embs = None # Get Precomputed speaker embedding From 3053a524f0724f95b3002c4822a1c3eb0e3da4f6 Mon Sep 17 00:00:00 2001 From: hsiehjackson Date: Tue, 2 May 2023 16:32:43 -0700 Subject: [PATCH 25/25] Fix comments in notebook Signed-off-by: hsiehjackson --- .../tts/FastPitch_Adapter_Finetuning.ipynb | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 186b241d30ef..fa1b1bdc90c8 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4d5e96d5", + "id": "ea49c0e5", "metadata": {}, "source": [ "# FastPitch Adapter Finetuning\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "f71417c8", + "id": "37259555", "metadata": {}, "source": [ "# License\n", @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ed77850", + "id": "d61cbea5", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30c00546", + "id": "fef9aba9", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a2eaa27", + "id": "49bc38ab", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca2f7996", + "id": "9459f9dc", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c6a587e", + "id": "eb26f54d", "metadata": {}, "outputs": [], "source": [ @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "397f4cf8", + "id": "12b28329", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "467fa160", + "id": "30996769", "metadata": {}, "source": [ "# 1. Fine-tune FastPitch on adaptation data" @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "e1ed84a4", + "id": "2f5f5945", "metadata": {}, "source": [ "## a. 
Data Preparation\n", @@ -167,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40a81e28", + "id": "8047f988", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35bd0031", + "id": "b8242769", "metadata": {}, "outputs": [], "source": [ @@ -188,7 +188,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f0446cc", + "id": "79cf8539", "metadata": {}, "outputs": [], "source": [ @@ -198,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "5da5540d", + "id": "35c3b97b", "metadata": {}, "source": [ "## b. Preprocessing" @@ -206,7 +206,7 @@ }, { "cell_type": "markdown", - "id": "6ed76ea3", + "id": "ba3a7c3a", "metadata": {}, "source": [ "### Add absolute file path in manifest\n", @@ -216,7 +216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4cfa38c", + "id": "8bc485b5", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37449a6d", + "id": "f9cb8ef5", "metadata": {}, "outputs": [], "source": [ @@ -241,7 +241,7 @@ }, { "cell_type": "markdown", - "id": "ea8faef9", + "id": "f92054d5", "metadata": {}, "source": [ "### Extract Supplementary Data\n", @@ -252,7 +252,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59c52fe5", + "id": "0adc618b", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +267,7 @@ }, { "cell_type": "markdown", - "id": "39e2ed47", + "id": "96dd5fe1", "metadata": {}, "source": [ "After running the above command line, you will observe a new folder NeMoTTS_sup_data/pitch and printouts of pitch statistics like below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e1a8f04", + "id": "23703c76", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "a4225952", + "id": "7c70e5db", "metadata": {}, "source": [ "## c. Model Setting\n", @@ -305,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03c74454", + "id": "439f2f82", "metadata": {}, "outputs": [], "source": [ @@ -318,7 +318,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ee4d0ad", + "id": "30f865cb", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +350,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0dbe5059", + "id": "e92910b5", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "58a28965", + "id": "7f03219f", "metadata": {}, "source": [ "### Precompute Speaker Embedding\n", @@ -370,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed1c4f4a", + "id": "c2a35241", "metadata": {}, "outputs": [], "source": [ @@ -405,7 +405,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0684d0d8", + "id": "5fa1b309", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +417,7 @@ }, { "cell_type": "markdown", - "id": "f16a9427", + "id": "3b77e95f", "metadata": {}, "source": [ "## d. 
Training" @@ -426,7 +426,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e8b0492c", + "id": "9e8c3740", "metadata": {}, "outputs": [], "source": [ @@ -440,12 +440,12 @@ }, { "cell_type": "markdown", - "id": "06905ed2", + "id": "19bb6d8b", "metadata": {}, "source": [ "### Important notes\n", "* `+init_from_nemo_model`: initialize with a multi-speaker FastPitch checkpoint\n", - "* `model.speaker_encoder.precompute=True`: use precomputed speaker embedding\n", + "* `model.speaker_encoder.precomputed_embedding_dim={spk_emb_dim}`: use precomputed speaker embedding\n", "* `~model.speaker_encoder.lookup_module`: we use precomputed speaker embedding, so we remove the pre-trained looked-up speaker embedding\n", "* `~model.speaker_encoder.gst_module`: we use precomputed speaker embedding, so we remove the pre-trained gst speaker embedding\n", "* Other optional arguments based on your preference:\n", @@ -457,7 +457,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b46d962c", + "id": "8c8cbea2", "metadata": {}, "outputs": [], "source": [ @@ -495,7 +495,7 @@ { "cell_type": "code", "execution_count": null, - "id": "058cca91", + "id": "fe5c7b2f", "metadata": {}, "outputs": [], "source": [ @@ -510,7 +510,7 @@ }, { "cell_type": "markdown", - "id": "2a3a149a", + "id": "75856d0e", "metadata": {}, "source": [ "# 3. Fine-tune HiFiGAN on adaptation data" @@ -518,7 +518,7 @@ }, { "cell_type": "markdown", - "id": "b08eb8a1", + "id": "3444698f", "metadata": {}, "source": [ "## a. Dataset Preparation\n", @@ -528,7 +528,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc4b912b", + "id": "bb2fd64d", "metadata": {}, "outputs": [], "source": [ @@ -554,7 +554,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55cac67f", + "id": "da69cb66", "metadata": {}, "outputs": [], "source": [ @@ -564,7 +564,7 @@ }, { "cell_type": "markdown", - "id": "5c1d0af1", + "id": "fa2cbb02", "metadata": {}, "source": [ "## b. Training" @@ -573,7 +573,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47b293c0", + "id": "ffdce5d5", "metadata": {}, "outputs": [], "source": [ @@ -601,7 +601,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dbe10ad9", + "id": "9e6376cf", "metadata": {}, "outputs": [], "source": [ @@ -613,7 +613,7 @@ }, { "cell_type": "markdown", - "id": "ac27f618", + "id": "e5076e51", "metadata": {}, "source": [ "# 4. Inference" @@ -622,7 +622,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f468c2d", + "id": "52358549", "metadata": {}, "outputs": [], "source": [ @@ -633,7 +633,7 @@ }, { "cell_type": "markdown", - "id": "67f7de09", + "id": "9e96ee13", "metadata": {}, "source": [ "## a. Load Model" @@ -642,7 +642,7 @@ { "cell_type": "code", "execution_count": null, - "id": "009e2463", + "id": "2cb5d524", "metadata": {}, "outputs": [], "source": [ @@ -652,7 +652,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be639296", + "id": "32dbd30c", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70001e21", + "id": "74a7ad03", "metadata": {}, "outputs": [], "source": [ @@ -678,7 +678,7 @@ }, { "cell_type": "markdown", - "id": "95e549ee", + "id": "4f882975", "metadata": {}, "source": [ "## b. 
Output Audio" @@ -687,7 +687,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c8395276", + "id": "2178a8ef", "metadata": {}, "outputs": [], "source": [ @@ -720,7 +720,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40d2bab5", + "id": "766154e3", "metadata": {}, "outputs": [], "source": [ @@ -743,7 +743,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a14cb9b", + "id": "dfa71ca6", "metadata": {}, "outputs": [], "source": [ @@ -775,7 +775,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32d84ee3", + "id": "51d9d176", "metadata": {}, "outputs": [], "source": [ @@ -786,7 +786,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ee3dfeb", + "id": "6180a7d2", "metadata": {}, "outputs": [], "source": [ @@ -797,7 +797,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e300f066", + "id": "5b33263b", "metadata": {}, "outputs": [], "source": []