Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into lstm-onnx
Browse files Browse the repository at this point in the history
  • Loading branch information
borisfom committed Feb 9, 2023
2 parents 34a65d4 + dfdd8f0 commit e140b7f
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 53 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Key Features
* `Speech synthesis (TTS) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/intro.html#>`_
* Spectrogram generation: Tacotron2, GlowTTS, TalkNet, FastPitch, FastSpeech2, Mixer-TTS, Mixer-TTS-X
* Vocoders: WaveGlow, SqueezeWave, UniGlow, MelGAN, HiFiGAN, UnivNet
* End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E
* End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E, VITS
* `NGC collection of pre-trained TTS models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_tts>`_
* `Tools <https://github.com/NVIDIA/NeMo/tree/stable/tools>`_
* `Text Processing (text normalization and inverse text normalization) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_normalization/intro.html>`_
Expand Down
6 changes: 6 additions & 0 deletions docs/source/tts/checkpoints.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,10 @@ Vocoders
.. csv-table::
:file: data/ngc_models_vocoder.csv
:align: left
:header-rows: 1

End2End models
^^^^^^^^^^^^^^
.. csv-table::
:file: data/ngc_models_e2e.csv
:align: left
:header-rows: 1
2 changes: 2 additions & 0 deletions docs/source/tts/data/ngc_models_e2e.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint
en-US,tts_en_lj_vits,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.vits.VitsModel,`tts_en_lj_vits <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_en_lj_vits>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_vits/versions/1.13.0/files/vits_ljspeech_fp16_full.nemo``
9 changes: 8 additions & 1 deletion nemo/collections/tts/models/vits.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,14 @@ def setup_test_data(self, cfg):
@classmethod
def list_available_models(cls) -> 'List[PretrainedModelInfo]':
list_of_models = []
# TODO: List available models??
model = PretrainedModelInfo(
pretrained_model_name="tts_en_lj_vits",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_vits/versions/1.13.0/files/vits_ljspeech_fp16_full.nemo",
description="This model is trained on LJSpeech audio sampled at 22050Hz. This model has been tested on generating female English "
"voices with an American accent.",
class_=cls,
)
list_of_models.append(model)
return list_of_models

@typecheck(
Expand Down
5 changes: 3 additions & 2 deletions nemo/collections/tts/modules/fastpitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def forward(

# Predict energy
if self.energy_predictor is not None:
energy_pred = self.energy_predictor(enc_out, enc_mask).squeeze(-1)
energy_pred = self.energy_predictor(prosody_input, enc_mask).squeeze(-1)

if energy is not None:
# Average energy over characters
Expand Down Expand Up @@ -357,6 +357,8 @@ def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume
)
pitch_predicted = self.pitch_predictor(prosody_input, enc_mask) + pitch
pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1))
enc_out = enc_out + pitch_emb.transpose(1, 2)

if self.energy_predictor is not None:
if energy is not None:
assert energy.shape[-1] == text.shape[-1], f"energy.shape[-1]: {energy.shape[-1]} != len(text)"
Expand All @@ -365,7 +367,6 @@ def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume
energy_pred = self.energy_predictor(prosody_input, enc_mask).squeeze(-1)
energy_emb = self.energy_emb(energy_pred.unsqueeze(1))
enc_out = enc_out + energy_emb.transpose(1, 2)
enc_out = enc_out + pitch_emb.transpose(1, 2)

# Expand to decoder time dimension
len_regulated, dec_lens = regulate_len(durs_predicted, enc_out, pace)
Expand Down
60 changes: 60 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[tool.isort]
profile = "black" # black-compatible
line_length = 119 # should match black parameters
ignore_whitespace = true # ignore whitespace for compatibility with the initial style
py_version = 38 # python 3.8 as a target version
known_first_party = ["nemo"] # FIRSTPARTY section
known_third_party = ["nemo_text_processing", "examples", "scripts"] # THIRDPARTY section
sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
default_section = "THIRDPARTY"
extend_skip = ["setup.py", "docs/source/conf.py"]


[tool.pytest.ini_options]
# durations=0 will display all tests execution time, sorted in descending order starting from the slowest one.
# -vv will also display tests with duration = 0.00s
addopts = "--verbose --pyargs --durations=0 --strict-markers" # always add these arguments to pytest
testpaths = ["tests"]
# directories to ignore when discovering tests
norecursedirs = [
"nemo",
"nemo_text_processing",
"external",
"examples",
"docs",
"scripts",
"tools",
"tutorials",
"*.egg",
".*",
"_darcs",
"build",
"CVS",
"dist",
"venv",
"{arch}"
]
# markers to select tests, use `pytest --markers` to see all available markers, `pytest -m "<marker>"` to select tests
markers = [
"unit: marks unit test, i.e. testing a single, well isolated functionality (deselect with '-m \"not unit\"')",
"integration: marks test checking the elements when integrated into subsystems (deselect with '-m \"not integration\"')",
"system: marks test working at the highest integration level (deselect with '-m \"not system\"')",
"acceptance: marks test checking whether the developed product/model passes the user defined acceptance criteria (deselect with '-m \"not acceptance\"')",
"docs: mark tests related to documentation (deselect with '-m \"not docs\"')",
"skipduringci: marks tests that are skipped ci as they are addressed by Jenkins jobs but should be run to test user setups",
"pleasefixme: marks tests that are broken and need fixing",
]
22 changes: 20 additions & 2 deletions scripts/installers/install_torchaudio_latest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,40 @@ LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \
# expected TORCHAUDIO_BUILD_VERSION=*.**.*
TORCHAUDIO_BUILD_VERSION=${LATEST_RELEASE:8:1}${PYTORCH_VERSION:1:5}

TORCH_MAJOR_VERSION=$(python3 -c "major_version = \"${PYTORCH_VERSION}\".split('.')[0]; print(major_version)")
TORCH_MINOR_VERSION=$(python3 -c "minor_version = \"${PYTORCH_VERSION}\".split('.')[1]; print(minor_version)")
TORCHAUDIO_MINOR_VERSION=$(python3 -c "minor_version = \"${LATEST_RELEASE}\".rsplit('.')[-1]; print(minor_version)")

if [[ $TORCH_MAJOR_VERSION -ne 1 ]]; then
echo "WARNING: Pytorch major version different from 1 not supported"
fi

echo "Latest torchaudio release: ${LATEST_RELEASE:8:4}"
echo "Pytorch version: ${PYTORCH_VERSION:0:6}"
echo "Torchaudio build version: ${TORCHAUDIO_BUILD_VERSION}"

if [[ "$TORCH_MINOR_VERSION" -lt "$TORCHAUDIO_MINOR_VERSION" ]]; then
# for old containers, we need to install matching torchaudio version
INSTALL_BRANCH="release/0.${TORCH_MINOR_VERSION}"
else
# for new containers use latest release
INSTALL_BRANCH=${LATEST_RELEASE}
fi

echo "Installing torchaudio from branch: ${INSTALL_BRANCH}"

# we need parameterized to run torchaudio tests
# suppose that we do not have parameterized installed yet
pip install parameterized

# Build torchaudio and run MFCC test
git clone --depth 1 --branch ${LATEST_RELEASE} https://github.com/pytorch/audio.git && \
git clone --depth 1 --branch ${INSTALL_BRANCH} https://github.com/pytorch/audio.git && \
cd audio && \
git submodule update --init --recursive && \
BUILD_SOX=1 BUILD_VERSION=${TORCHAUDIO_BUILD_VERSION} python setup.py install && \
cd .. && \
pytest -rs audio/test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py -k 'test_MFCC' || \
(echo "ERROR: Failed to install torchaudio!"; exit 1);
{ echo "ERROR: Failed to install torchaudio!"; exit 1; };
# RNNT loss is built with CUDA, so checking it will suffice
# This test will be skipped if CUDA is not available (e.g. when building from docker)
pytest -rs audio/test/torchaudio_unittest/functional/torchscript_consistency_cuda_test.py -k 'test_rnnt_loss' || \
Expand Down
43 changes: 0 additions & 43 deletions setup.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ def req_file(filename, folder="requirements"):

extras_require['slu'] = list(chain([extras_require['slu'], extras_require['asr']]))

tests_requirements = extras_require["test"]


###############################################################################
# Code style checkers #
Expand Down Expand Up @@ -257,8 +255,6 @@ def finalize_options(self):
],
packages=setuptools.find_packages(),
install_requires=install_requires,
setup_requires=['pytest-runner'],
tests_require=tests_requirements,
# List additional groups of dependencies here (e.g. development
# dependencies). You can install these using the following syntax,
# $ pip install -e ".[all]"
Expand Down

0 comments on commit e140b7f

Please sign in to comment.