diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index fc48bc06b3ca..120969ee9dfa 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -885,9 +885,9 @@ Hybrid ASR-TTS Model Configuration :ref:`Hybrid ASR-TTS model ` consists of three parts: -* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) +* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) * TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) -* Enhancer model (optional) +* :ref:`Enhancer model ` (optional) Also, the config allows to specify :ref:`text-only dataset `. @@ -895,7 +895,7 @@ Main parts of the config: * ASR model * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once, then the config of the ASR model is stored in the ``asr_model`` field - * ``asr_model_type``: needed only when training from scratch, ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE`` + * ``asr_model_type``: needed only when training from scratch. ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel`` * ``asr_model_fuse_bn``: fusing BatchNorm in the pretrained ASR model, can improve quality in finetuning scenario * TTS model * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field @@ -907,7 +907,7 @@ Main parts of the config: * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training) * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value. 
- * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). See parameters for ``nemo.collections.common.data.ConcatDataset`` + * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). Correspond to ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the :mod:`ConcatDataset `. * all other components are similar to conventional ASR models * ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model @@ -920,7 +920,7 @@ Main parts of the config: # asr model asr_model_path: ??? asr_model: null - asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred + asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models # tts model @@ -972,6 +972,7 @@ Training from Scratch To train ASR model from scratch using text-only data use ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with conventional ASR model config, e.g. ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml`` Please specify the ASR model type, paths to the TTS model, and (optional) enhancer, along with text-only data-related fields. +Use ``++`` or ``+`` markers for these options, since the options are not present in the original ASR model config. .. 
code-block:: shell diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 2323e1636fcc..80a0fd90f0fb 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -330,7 +330,7 @@ The model consists of three models: -* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) +* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) * Frozen TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) -* Optional frozen Enhancer model trained to mitigate mismatch between real and generated mel spectrogram +* Optional frozen :ref:`Spectrogram Enhancer model ` trained to mitigate mismatch between real and generated mel spectrogram .. image:: images/hybrid_asr_tts_model.png :align: center diff --git a/docs/source/common/data.rst b/docs/source/common/data.rst new file mode 100644 index 000000000000..4c2f38cbba83 --- /dev/null +++ b/docs/source/common/data.rst @@ -0,0 +1,13 @@ +Data +---- + +.. autoclass:: nemo.collections.common.data.dataset.ConcatDataset + :show-inheritance: + :members: + :undoc-members: + + +.. autoclass:: nemo.collections.common.data.dataset.ConcatMapDataset + :show-inheritance: + :members: + :undoc-members: diff --git a/docs/source/common/intro.rst b/docs/source/common/intro.rst index dbe8d5d17930..fadbd9528485 100644 --- a/docs/source/common/intro.rst +++ b/docs/source/common/intro.rst @@ -10,3 +10,4 @@ The common collection contains things that could be used across all collections. losses metrics tokenizers + data diff --git a/docs/source/tts/api.rst b/docs/source/tts/api.rst index 2b706132fc0d..e291a995d3cb 100644 --- a/docs/source/tts/api.rst +++ b/docs/source/tts/api.rst @@ -25,6 +25,11 @@ Mel-Spectrogram Generators :members: :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start +.. 
autoclass:: nemo.collections.tts.models.SpectrogramEnhancerModel + :show-inheritance: + :members: + :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start + Speech-to-Text Aligner Models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index 8b283529a706..fedfd157c307 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -112,7 +112,7 @@ Speech-to-text alignment is a critical component of neural TTS models. Autoregre End2End Models --------- +-------------- VITS ~~~~~~~~~~~~~~~ @@ -123,6 +123,17 @@ VITS is an end-to-end speech synthesis model, which generates raw waveform audio :alt: vits model :scale: 25% + +Enhancers +--------- + +.. _SpectrogramEnhancer_model: + +Spectrogram Enhancer +~~~~~~~~~~~~~~~~~~~~ +GAN-based model to add details to blurry spectrograms from TTS models like Tacotron or FastPitch. + + References ---------- diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index 386a567cf2dc..946202364c53 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -19,7 +19,7 @@ ```shell python speech_to_text_bpe_with_text.py \ # (Optional: --config-path= --config-name=) \ - ++asr_model_type= \ + ++asr_model_type= \ ++tts_model_path= \ ++enhancer_model_path= \ model.tokenizer.dir= \ diff --git a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml index e933fc59b40f..bdd483215632 100644 --- a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml +++ b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml @@ -8,7 +8,7 @@ model: # asr model asr_model_path: ??? 
asr_model: null - asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred + asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models # tts model diff --git a/nemo/collections/asr/data/text_to_text.py b/nemo/collections/asr/data/text_to_text.py index 23ccd3d7a2ef..88b417ea21bc 100644 --- a/nemo/collections/asr/data/text_to_text.py +++ b/nemo/collections/asr/data/text_to_text.py @@ -37,8 +37,7 @@ try: from nemo_text_processing.text_normalization.normalize import Normalizer except Exception as e: - logging.warning(e) - logging.warning("nemo_text_processing is not installed") + pass # Normalizer imported only for annotation purposes, error can be ignored AnyPath = Union[Path, str] diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 1f15e49e0b0d..8486f956c3b7 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -33,6 +33,7 @@ ) from nemo.collections.asr.models.asr_model import ASRModel from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE +from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder from nemo.collections.asr.parts.preprocessing.features import clean_spectrogram_batch, normalize_batch @@ -89,7 +90,7 @@ class ASRWithTTSModel(ASRModel): Text-only data can be mixed with audio-text pairs """ - asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE] + asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel] tts_model: FastPitchModel enhancer_model: 
Optional[SpectrogramEnhancerModel] @@ -100,6 +101,7 @@ class ASRModelTypes(PrettyStrEnum): RNNT_BPE = "rnnt_bpe" CTC_BPE = "ctc_bpe" + HYBRID_RNNT_CTC_BPE = "hybrid_rnnt_ctc_bpe" @classmethod def from_asr_model(cls, model: Any): @@ -107,6 +109,8 @@ def from_asr_model(cls, model: Any): return cls.RNNT_BPE if isinstance(model, EncDecCTCModelBPE): return cls.CTC_BPE + if isinstance(model, EncDecHybridRNNTCTCBPEModel): + return cls.HYBRID_RNNT_CTC_BPE raise ValueError(f"Unsupported model type: {type(model)}") def get_asr_cls(self): @@ -114,6 +118,8 @@ def get_asr_cls(self): return EncDecRNNTBPEModel if self == self.CTC_BPE: return EncDecCTCModelBPE + if self == self.HYBRID_RNNT_CTC_BPE: + return EncDecHybridRNNTCTCBPEModel raise NotImplementedError(f"Not implemented for value {self.value}") @classmethod @@ -540,7 +546,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer, @@ -556,7 +562,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer,