Merge branch 'main' into jbaczek/bugfix/update_LayerNorm1P_API

NVIDIA · Jan 9, 2024 · 143aab7 · 143aab7
2 parents c22e364 + 20adcc3
commit 143aab7
Show file tree

Hide file tree

Showing 64 changed files with 2,300 additions and 2,139 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -54,7 +54,7 @@ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
 RUN git clone https://github.com/NVIDIA/apex.git && \
   cd apex && \
   git checkout 52e18c894223800cb611682dce27d88050edf1de && \
-  pip install install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+  pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
 
 RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
   cd TransformerEngine && \

diff --git a/README.rst b/README.rst
@@ -327,7 +327,7 @@ To install Apex, run
     git clone https://github.com/NVIDIA/apex.git
     cd apex
     git checkout 52e18c894223800cb611682dce27d88050edf1de
-    pip install install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
 
 It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies.
 

diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst
@@ -229,11 +229,11 @@ Miscellaneous Classes
 CTC Decoding
 ~~~~~~~~~~~~
 
-.. autoclass:: nemo.collections.asr.metrics.wer.CTCDecoding
+.. autoclass:: nemo.collections.asr.parts.submodules.ctc_decoding.CTCDecoding
     :show-inheritance:
     :members:
 
-.. autoclass:: nemo.collections.asr.metrics.wer_bpe.CTCBPEDecoding
+.. autoclass:: nemo.collections.asr.parts.submodules.ctc_decoding.CTCBPEDecoding
     :show-inheritance:
     :members:
 
@@ -248,11 +248,11 @@ CTC Decoding
 RNNT Decoding
 ~~~~~~~~~~~~~
 
-.. autoclass:: nemo.collections.asr.metrics.rnnt_wer.RNNTDecoding
+.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_decoding.RNNTDecoding
     :show-inheritance:
     :members:
 
-.. autoclass:: nemo.collections.asr.metrics.rnnt_wer_bpe.RNNTBPEDecoding
+.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_decoding.RNNTBPEDecoding
     :show-inheritance:
     :members:
 

diff --git a/docs/source/asr/speaker_diarization/datasets.rst b/docs/source/asr/speaker_diarization/datasets.rst
@@ -205,14 +205,14 @@ The following are descriptions about each field in an input manifest JSON file.
 
 ``ctm_filepath`` (Optional):
 
-  CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. CTM file follows the following convention: ``<uniq-id> <speaker ID> <word start time> <word end time> <word> <confidence>`` Since confidence is not required for evaluating diarization results, it can have any value. Note that the ``<speaker_id>`` should be exactly matched with speaker IDs in RTTM. 
+  The CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. The CTM file follows this convention: ``<session name> <channel ID> <start time> <duration> <word> <confidence> <type of token> <speaker>``. Note that the ``<speaker>`` should exactly match the speaker IDs in RTTM. Since confidence is not required for evaluating diarization results, we assign ``<confidence>`` the value ``NA``. If the token is a word, we assign ``<type of token>`` the value ``lex``.
 
   Example lines of CTM file:
 
 .. code-block:: bash
   
-   TS3012d.Mix-Headset MTD046ID 12.879 0.32 okay 0
-   TS3012d.Mix-Headset MTD046ID 13.203 0.24 yeah 0
+   TS3012d.Mix-Headset 1 12.879 0.32 okay NA lex MTD046ID
+   TS3012d.Mix-Headset 1 13.203 0.24 yeah NA lex MTD046ID
 
 
 Evaluation on Benchmark Datasets

diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
@@ -47,8 +47,8 @@
 import torch
 from omegaconf import OmegaConf
 
-from nemo.collections.asr.metrics.wer import CTCDecodingConfig
 from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel
+from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
 from nemo.collections.asr.parts.utils.transcribe_utils import (

diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
@@ -67,8 +67,9 @@
 import pytorch_lightning as pl
 import torch
 from omegaconf import OmegaConf, open_dict
-from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
+
 from nemo.collections.asr.models import EncDecHybridRNNTCTCModel, EncDecRNNTModel
+from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.streaming_utils import (
     BatchedFrameASRRNNT,

diff --git a/examples/asr/asr_vad/speech_to_text_with_vad.py b/examples/asr/asr_vad/speech_to_text_with_vad.py
@@ -69,9 +69,10 @@
 from tqdm import tqdm
 
 from nemo.collections.asr.data import feature_to_text_dataset
-from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
-from nemo.collections.asr.metrics.wer import CTCDecodingConfig, word_error_rate
+from nemo.collections.asr.metrics.wer import word_error_rate
 from nemo.collections.asr.models import ASRModel, EncDecClassificationModel
+from nemo.collections.asr.parts.submodules import CTCDecodingConfig
+from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
 from nemo.collections.asr.parts.utils.vad_utils import (
     generate_overlap_vad_seq,

diff --git a/examples/asr/quantization/speech_to_text_quant_infer.py b/examples/asr/quantization/speech_to_text_quant_infer.py
@@ -23,8 +23,9 @@
 import torch
 from omegaconf import open_dict
 
-from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig, word_error_rate
+from nemo.collections.asr.metrics.wer import WER, word_error_rate
 from nemo.collections.asr.models import EncDecCTCModel
+from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig
 from nemo.utils import logging
 
 try:

diff --git a/examples/asr/quantization/speech_to_text_quant_infer_trt.py b/examples/asr/quantization/speech_to_text_quant_infer_trt.py
@@ -25,8 +25,9 @@
 import torch
 from omegaconf import open_dict
 
-from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig, word_error_rate
+from nemo.collections.asr.metrics.wer import WER, word_error_rate
 from nemo.collections.asr.models import EncDecCTCModel
+from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig
 from nemo.utils import logging
 
 # Use autoprimaryctx if available (pycuda >= 2021.1) to

diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
@@ -21,10 +21,10 @@
 import torch
 from omegaconf import OmegaConf, open_dict
 
-from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
-from nemo.collections.asr.metrics.wer import CTCDecodingConfig
 from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel
 from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
+from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
+from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.transcribe_utils import (

diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py
@@ -80,10 +80,10 @@
 from omegaconf import MISSING, OmegaConf
 
 from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter
-from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
 from nemo.collections.asr.metrics.wer import word_error_rate
 from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel
 from nemo.collections.asr.models.configs.asr_models_config import ASRDatasetConfig
+from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.core.config import TrainerConfig, hydra_runner
 from nemo.utils import logging
 from nemo.utils.get_rank import is_global_rank_zero

diff --git a/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py b/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py
@@ -63,8 +63,6 @@ def main(cfg):
 
     # If RTTM is provided and DER evaluation
     if diar_score is not None:
-        metric, mapping_dict, _ = diar_score
-
         # Get session-level diarization error rate and speaker counting error
         der_results = OfflineDiarWithASR.gather_eval_results(
             diar_score=diar_score,