From 0eb3b4aa9121fd0ff598162e7041637104d36df5 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 16:51:20 +0400 Subject: [PATCH 01/11] ASR-TTS: support hybrid RNNT-CTC models Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 2 +- examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py | 2 +- examples/asr/conf/asr_tts/hybrid_asr_tts.yaml | 2 +- nemo/collections/asr/models/hybrid_asr_tts_models.py | 8 +++++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index fc48bc06b3ca..c0a5c24c5e9e 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -920,7 +920,7 @@ Main parts of the config: # asr model asr_model_path: ??? asr_model: null - asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred + asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models # tts model diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index 386a567cf2dc..946202364c53 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -19,7 +19,7 @@ ```shell python speech_to_text_bpe_with_text.py \ # (Optional: --config-path= --config-name=) \ - ++asr_model_type= \ + ++asr_model_type= \ ++tts_model_path= \ ++enhancer_model_path= \ model.tokenizer.dir= \ diff --git a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml index e933fc59b40f..bdd483215632 100644 --- a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml +++ b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml @@ -8,7 +8,7 @@ model: # asr model asr_model_path: ??? 
asr_model: null - asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred + asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models # tts model diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 1f15e49e0b0d..6be99f3c4ac4 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -33,6 +33,7 @@ ) from nemo.collections.asr.models.asr_model import ASRModel from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE +from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder from nemo.collections.asr.parts.preprocessing.features import clean_spectrogram_batch, normalize_batch @@ -89,7 +90,7 @@ class ASRWithTTSModel(ASRModel): Text-only data can be mixed with audio-text pairs """ - asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE] + asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel] tts_model: FastPitchModel enhancer_model: Optional[SpectrogramEnhancerModel] @@ -100,6 +101,7 @@ class ASRModelTypes(PrettyStrEnum): RNNT_BPE = "rnnt_bpe" CTC_BPE = "ctc_bpe" + HYBRID_RNNT_CTC_BPE = "hybrid_rnnt_ctc_bpe" @classmethod def from_asr_model(cls, model: Any): @@ -107,6 +109,8 @@ def from_asr_model(cls, model: Any): return cls.RNNT_BPE if isinstance(model, EncDecCTCModelBPE): return cls.CTC_BPE + if isinstance(model, EncDecHybridRNNTCTCBPEModel): + return cls.HYBRID_RNNT_CTC_BPE raise ValueError(f"Unsupported model type: {type(model)}") def get_asr_cls(self): @@ -114,6 
+118,8 @@ def get_asr_cls(self): return EncDecRNNTBPEModel if self == self.CTC_BPE: return EncDecCTCModelBPE + if self == self.HYBRID_RNNT_CTC_BPE: + return EncDecHybridRNNTCTCBPEModel raise NotImplementedError(f"Not implemented for value {self.value}") @classmethod From 27a9ada3b3b55a38bd15ce9c37ee20e3104d5801 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 17:25:38 +0400 Subject: [PATCH 02/11] Do not warn on optional import. Separate import guard Signed-off-by: Vladimir Bataev --- nemo/collections/asr/data/text_to_text.py | 6 ++---- nemo/utils/import_guards.py | 24 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 nemo/utils/import_guards.py diff --git a/nemo/collections/asr/data/text_to_text.py b/nemo/collections/asr/data/text_to_text.py index 23ccd3d7a2ef..6f20a9b544d8 100644 --- a/nemo/collections/asr/data/text_to_text.py +++ b/nemo/collections/asr/data/text_to_text.py @@ -33,12 +33,10 @@ from nemo.collections.common.tokenizers import TokenizerSpec from nemo.core.classes import Dataset, IterableDataset from nemo.utils import logging +from nemo.utils.import_guards import optional_import_guard -try: +with optional_import_guard(): from nemo_text_processing.text_normalization.normalize import Normalizer -except Exception as e: - logging.warning(e) - logging.warning("nemo_text_processing is not installed") AnyPath = Union[Path, str] diff --git a/nemo/utils/import_guards.py b/nemo/utils/import_guards.py new file mode 100644 index 000000000000..e96903c33577 --- /dev/null +++ b/nemo/utils/import_guards.py @@ -0,0 +1,24 @@ +from contextlib import contextmanager + +from nemo.utils import logging + + +@contextmanager +def optional_import_guard(warn_on_error=False): + """ + Context manager to wrap optional import. + Suppresses ImportError(also, ModuleNotFoundError), adds warning if `warn_on_error` is True. + Use separately for each library. + + >>> with optional_import_guard(): + ... 
import optional_library + + :param warn_on_error: log warning if import resulted in error + """ + try: + yield + except ImportError as e: + if warn_on_error: + logging.warning(e) + finally: + pass From 93441202848624187506cf6ff3be6fdea06ac042 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 17:29:46 +0400 Subject: [PATCH 03/11] Explain adding options to config Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index c0a5c24c5e9e..53c64c329cd8 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -972,6 +972,7 @@ Training from Scratch To train ASR model from scratch using text-only data use ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with conventional ASR model config, e.g. ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml`` Please specify the ASR model type, paths to the TTS model, and (optional) enhancer, along with text-only data-related fields. +Use ``++`` or ``+`` markers for these options, since the options are not present in the original ASR model config. .. code-block:: shell From 8461a63aa4948f5b57c8624bcb4f650f83df341e Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 17:47:55 +0400 Subject: [PATCH 04/11] Fix import guard docs Signed-off-by: Vladimir Bataev --- nemo/utils/import_guards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/import_guards.py b/nemo/utils/import_guards.py index e96903c33577..0827da32bf98 100644 --- a/nemo/utils/import_guards.py +++ b/nemo/utils/import_guards.py @@ -7,7 +7,7 @@ def optional_import_guard(warn_on_error=False): """ Context manager to wrap optional import. - Suppresses ImportError(also, ModuleNotFoundError), adds warning if `warn_on_error` is True. 
+ Suppresses `ImportError` (also, `ModuleNotFoundError`), adds warning if `warn_on_error` is True. Use separately for each library. >>> with optional_import_guard(): From 99ac7262839aea7f95e8ca709f90dbaa06e20259 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 17:48:05 +0400 Subject: [PATCH 05/11] Add docs for ConcatDataset Signed-off-by: Vladimir Bataev --- docs/source/common/data.rst | 13 +++++++++++++ docs/source/common/intro.rst | 1 + 2 files changed, 14 insertions(+) create mode 100644 docs/source/common/data.rst diff --git a/docs/source/common/data.rst b/docs/source/common/data.rst new file mode 100644 index 000000000000..4c2f38cbba83 --- /dev/null +++ b/docs/source/common/data.rst @@ -0,0 +1,13 @@ +Data +---- + +.. autoclass:: nemo.collections.common.data.dataset.ConcatDataset + :show-inheritance: + :members: + :undoc-members: + + +.. autoclass:: nemo.collections.common.data.dataset.ConcatMapDataset + :show-inheritance: + :members: + :undoc-members: diff --git a/docs/source/common/intro.rst b/docs/source/common/intro.rst index dbe8d5d17930..fadbd9528485 100644 --- a/docs/source/common/intro.rst +++ b/docs/source/common/intro.rst @@ -10,3 +10,4 @@ The common collection contains things that could be used across all collections. 
losses metrics tokenizers + data From a09abced6c60004a24a152dba6379b92b6fea3ea Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 17:53:48 +0400 Subject: [PATCH 06/11] Add explanation for sampling parameters Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index 53c64c329cd8..b6794f7594c5 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -907,7 +907,7 @@ Main parts of the config: * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training) * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value. - * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). See parameters for ``nemo.collections.common.data.ConcatDataset`` + * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). Correspond to ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the :mod:`ConcatDataset <nemo.collections.common.data.dataset.ConcatDataset>`. 
* all other components are similar to conventional ASR models * ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model From 0de5855cdf95e2041188c50433390b84fce08b71 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 18:01:12 +0400 Subject: [PATCH 07/11] Fix supported models Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index b6794f7594c5..dbd8c6ec4a42 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -885,7 +885,7 @@ Hybrid ASR-TTS Model Configuration :ref:`Hybrid ASR-TTS model ` consists of three parts: -* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) +* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) * TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) * Enhancer model (optional) @@ -895,7 +895,7 @@ Main parts of the config: * ASR model * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once, then the config of the ASR model is stored in the ``asr_model`` field - * ``asr_model_type``: needed only when training from scratch, ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE`` + * ``asr_model_type``: needed only when training from scratch. 
``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel`` * ``asr_model_fuse_bn``: fusing BatchNorm in the pretrained ASR model, can improve quality in finetuning scenario * TTS model * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field From cf28a9899cfba118f8ecbb435f7cfb958d5404cb Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 18:26:00 +0400 Subject: [PATCH 08/11] Initial docs for the enhancer model Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 2 +- docs/source/asr/models.rst | 2 +- docs/source/tts/api.rst | 5 +++++ docs/source/tts/models.rst | 13 ++++++++++++- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index dbd8c6ec4a42..120969ee9dfa 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -887,7 +887,7 @@ Hybrid ASR-TTS Model Configuration * ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) * TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) -* Enhancer model (optional) +* :ref:`Enhancer model <SpectrogramEnhancer_model>` (optional) Also, the config allows to specify :ref:`text-only dataset `. 
diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 2323e1636fcc..80a0fd90f0fb 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -330,7 +330,7 @@ The model consists of three models: * ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) * Frozen TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) -* Optional frozen Enhancer model trained to mitigate mismatch between real and generated mel spectrogram +* Optional frozen :ref:`Spectrogram Enhancer model <SpectrogramEnhancer_model>` trained to mitigate mismatch between real and generated mel spectrogram .. image:: images/hybrid_asr_tts_model.png :align: center diff --git a/docs/source/tts/api.rst b/docs/source/tts/api.rst index 2b706132fc0d..e291a995d3cb 100644 --- a/docs/source/tts/api.rst +++ b/docs/source/tts/api.rst @@ -25,6 +25,11 @@ Mel-Spectrogram Generators :members: :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start +.. autoclass:: nemo.collections.tts.models.SpectrogramEnhancerModel + :show-inheritance: + :members: + :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start + Speech-to-Text Aligner Models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index 8b283529a706..fedfd157c307 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -112,7 +112,7 @@ Speech-to-text alignment is a critical component of neural TTS models. Autoregre End2End Models --------- +-------------- VITS ~~~~~~~~~~~~~~~ @@ -123,6 +123,17 @@ VITS is an end-to-end speech synthesis model, which generates raw waveform audio :alt: vits model :scale: 25% + +Enhancers +--------- + +.. 
_SpectrogramEnhancer_model: + +Spectrogram Enhancer +~~~~~~~~~~~~~~~~~~~~ +GAN-based model to add details to blurry spectrograms from TTS models like Tacotron or FastPitch. + + References ---------- From c98a5715609d1ebb2cd10d25a8652d05600a5884 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 May 2023 19:50:22 +0400 Subject: [PATCH 09/11] Fix copyright header Signed-off-by: Vladimir Bataev --- nemo/utils/import_guards.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nemo/utils/import_guards.py b/nemo/utils/import_guards.py index 0827da32bf98..258704ea3739 100644 --- a/nemo/utils/import_guards.py +++ b/nemo/utils/import_guards.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from contextlib import contextmanager from nemo.utils import logging From 736a3e116ded1aa051de4be2444356a12d93e0fd Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 12 May 2023 19:45:21 +0400 Subject: [PATCH 10/11] Fix use_start_end_token parameter usage Signed-off-by: Vladimir Bataev --- nemo/collections/asr/models/hybrid_asr_tts_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 6be99f3c4ac4..8486f956c3b7 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -546,7 +546,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer, @@ -562,7 +562,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer, From 1652ea53c32cccc6cae8eb26acda468bd5c60fa5 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 12 May 2023 21:50:25 +0400 Subject: [PATCH 11/11] Revert import_guard, do not warn on import error for Normalizer Signed-off-by: Vladimir Bataev --- nemo/collections/asr/data/text_to_text.py | 5 +-- nemo/utils/import_guards.py | 38 ----------------------- 2 
files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 nemo/utils/import_guards.py diff --git a/nemo/collections/asr/data/text_to_text.py b/nemo/collections/asr/data/text_to_text.py index 6f20a9b544d8..88b417ea21bc 100644 --- a/nemo/collections/asr/data/text_to_text.py +++ b/nemo/collections/asr/data/text_to_text.py @@ -33,10 +33,11 @@ from nemo.collections.common.tokenizers import TokenizerSpec from nemo.core.classes import Dataset, IterableDataset from nemo.utils import logging -from nemo.utils.import_guards import optional_import_guard -with optional_import_guard(): +try: from nemo_text_processing.text_normalization.normalize import Normalizer +except Exception as e: + pass # Normalizer imported only for annotation purposes, error can be ignored AnyPath = Union[Path, str] diff --git a/nemo/utils/import_guards.py b/nemo/utils/import_guards.py deleted file mode 100644 index 258704ea3739..000000000000 --- a/nemo/utils/import_guards.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from contextlib import contextmanager - -from nemo.utils import logging - - -@contextmanager -def optional_import_guard(warn_on_error=False): - """ - Context manager to wrap optional import. - Suppresses `ImportError` (also, `ModuleNotFoundError`), adds warning if `warn_on_error` is True. - Use separately for each library. - - >>> with optional_import_guard(): - ... 
import optional_library - - :param warn_on_error: log warning if import resulted in error - """ - try: - yield - except ImportError as e: - if warn_on_error: - logging.warning(e) - finally: - pass