
Commit

Merge branch 'main' into asr_with_tts_tutorial
artbataev authored Jul 6, 2023
2 parents 0894b32 + 1744718 commit 641c2f8
Showing 19 changed files with 31 additions and 49 deletions.
10 changes: 5 additions & 5 deletions Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.06-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
@@ -45,11 +45,11 @@ RUN apt-get update && \
WORKDIR /workspace/

WORKDIR /tmp/
-# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch
+# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch
# container
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
-git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 && \
+git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \
pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

# uninstall stuff from base container
@@ -75,7 +75,7 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
# install flash attention dependencies
RUN pip install flash-attn
# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
RUN pip install triton==2.0.0.dev20221202

# install k2, skip if installation fails
COPY scripts /tmp/nemo/scripts/
@@ -94,7 +94,7 @@ COPY . .

# start building the final container
FROM nemo-deps as nemo
-ARG NEMO_VERSION=1.19.0
+ARG NEMO_VERSION=1.20.0

# Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
# version information as runtime environment variable for introspection purposes
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
agent {
docker {
-image 'nvcr.io/nvidia/pytorch:23.04-py3'
+image 'nvcr.io/nvidia/pytorch:23.06-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1'
}
}
4 changes: 2 additions & 2 deletions README.rst
@@ -319,13 +319,13 @@ To build a nemo container with Dockerfile from a branch, please run
DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .
-If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub.
+If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub.

.. code-block:: bash
docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
-p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
-stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3
+stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3
Examples
--------
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/rnnt_wer.py
@@ -1224,7 +1224,7 @@ def validation_epoch_end(self, outputs):
def __init__(
self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False
):
-super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.batch_dim_index = batch_dim_index
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/rnnt_wer_bpe.py
@@ -359,7 +359,7 @@ def __init__(
log_prediction: bool = True,
dist_sync_on_step=False,
):
-super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.batch_dim_index = batch_dim_index
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/wer.py
@@ -1125,7 +1125,7 @@ def __init__(
fold_consecutive=True,
dist_sync_on_step=False,
):
-super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super().__init__(dist_sync_on_step=dist_sync_on_step)

self.decoding = decoding
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/wer_bpe.py
@@ -247,7 +247,7 @@ def __init__(
fold_consecutive=True,
dist_sync_on_step=False,
):
-super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super().__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.tokenizer = self.decoding.tokenizer
self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size
9 changes: 2 additions & 7 deletions nemo/collections/common/metrics/global_average_loss_metric.py
@@ -28,9 +28,6 @@ class GlobalAverageLossMetric(Metric):
See :doc:`PyTorch Lightning Metrics<pytorch-lightning:metrics>` for the metric usage instruction.
Args:
-compute_on_step:
-The method :meth:`forward` only calls ``update()`` and returns ``None`` if this is set to ``False``.
-default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each method :meth:`forward` call before returning the
value at the step
@@ -44,10 +41,8 @@ class GlobalAverageLossMetric(Metric):

full_state_update = True

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, take_avg_loss=True):
-super().__init__(
-compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group
-)
+def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True):
+super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum')
self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum')
self.take_avg_loss = take_avg_loss
8 changes: 2 additions & 6 deletions nemo/collections/common/metrics/perplexity.py
@@ -29,8 +29,6 @@ class Perplexity(Metric):
See `PyTorch Lightning Metrics <https://pytorch-lightning.readthedocs.io/en/stable/ecosystem/metrics.html>`_ for the metric usage instructions.
Args:
-compute_on_step:
-Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each ``forward()``
before returning the value at the step.
@@ -44,10 +42,8 @@ class Perplexity(Metric):

full_state_update = True

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, validate_args=True):
-super().__init__(
-compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group
-)
+def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True):
+super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
self.validate_args = validate_args
self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum')
# Total number of distributions seen since last reset
9 changes: 2 additions & 7 deletions nemo/collections/nlp/metrics/sequence_perplexity.py
@@ -31,8 +31,6 @@ class SequencePerplexity(Metric):
See :doc:`PyTorch Lightning Metrics<pytorch-lightning:metrics>` for the metric usage instructions.
Args:
-compute_on_step:
-Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each ``forward()`` before returning the value at the step.
process_group:
@@ -43,12 +41,9 @@
to perform the allgather.
"""

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, dist_sync_fn=None):
+def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None):
super().__init__(
-compute_on_step=compute_on_step,
-dist_sync_on_step=dist_sync_on_step,
-process_group=process_group,
-dist_sync_fn=dist_sync_fn,
+dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn,
)

# Total sum of exponentiated average negative log likelihoods
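
Across these metric files, the common change is dropping the compute_on_step argument, which recent torchmetrics releases removed from Metric.__init__. A minimal sketch of the resulting constructor pattern, assuming torchmetrics >= 0.11 (the AverageLoss class and its update/compute bodies below are illustrative, not NeMo code):

    import torch
    from torchmetrics import Metric

    class AverageLoss(Metric):
        full_state_update = True

        def __init__(self, dist_sync_on_step=False, process_group=None):
            # compute_on_step no longer exists, so it is not forwarded to the base class
            super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
            self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx="sum")
            self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx="sum")

        def update(self, loss: torch.Tensor):
            self.loss_sum += loss.detach().double()
            self.num_measurements += 1

        def compute(self):
            return self.loss_sum / self.num_measurements

With this API, forward() always computes and returns the batch value; code that previously passed compute_on_step=False to suppress that behavior calls update() directly instead.
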
@@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
# create extra bias

# setup to track metrics
-self.validation_perplexity = Perplexity(compute_on_step=False)
+self.validation_perplexity = Perplexity()

self.setup_optimization(cfg.optim)

@@ -100,7 +100,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
decoder=cfg.language_model.pretrained_decoder_model_name,
)

-self.validation_perplexity = Perplexity(compute_on_step=False)
+self.validation_perplexity = Perplexity()

self.setup_optimization(cfg.optim)

12 changes: 6 additions & 6 deletions nemo/core/optim/distributed_adam.py
@@ -19,6 +19,7 @@
from apex.contrib.optimizers.distributed_fused_adam import (
DistributedFusedAdam,
_coalescing_manager,
+_coalescing_manager_append_work,
_disable_pre_forward_hook,
)
from megatron.core import parallel_state
@@ -173,16 +174,15 @@ def _fp32_optim_grad_sync(self):
for model_param, main_param in self._fp32_optim_main_params.items():
if model_param.grad is not None:
main_param.grad += model_param.grad.detach()
-sync_requests = []
-with _coalescing_manager(self.process_group, self.device, sync_requests):
+with _coalescing_manager(self.process_group, self.device, async_ops=True) as cm:
for main_param in self._fp32_optim_main_params.values():
-sync_requests.append(
+_coalescing_manager_append_work(
+cm,
torch.distributed.all_reduce(
main_param.grad, op=torch.distributed.ReduceOp.AVG, group=self.process_group, async_op=True,
-)
+),
)
-for req in sync_requests:
-req.wait()
+cm.wait()
self._fp32_optim_grad_sync_needed = False

def zero_grad(self, *args, **kwargs):
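
The distributed_adam.py update above switches to the newer Apex coalescing-manager API: all-reduces are queued inside a context manager and the coalesced work is waited on once, instead of tracking a list of requests. A sketch of that pattern in isolation, based on the updated lines above (sync_fp32_grads and its arguments are illustrative names, and exact Apex signatures may vary between versions):

    import torch
    from apex.contrib.optimizers.distributed_fused_adam import (
        _coalescing_manager,
        _coalescing_manager_append_work,
    )

    def sync_fp32_grads(grads, process_group, device):
        # Queue one async all-reduce per gradient inside a single coalescing context,
        # then wait once for the coalesced work rather than per request.
        with _coalescing_manager(process_group, device, async_ops=True) as cm:
            for grad in grads:
                _coalescing_manager_append_work(
                    cm,
                    torch.distributed.all_reduce(
                        grad, op=torch.distributed.ReduceOp.AVG, group=process_group, async_op=True
                    ),
                )
        cm.wait()
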
1 change: 0 additions & 1 deletion nemo/core/optim/optimizers.py
@@ -51,7 +51,6 @@
AVAILABLE_OPTIMIZERS['fused_adam'] = FusedAdam
except ModuleNotFoundError:
HAVE_APEX = False
-logging.warning("Apex was not found. Using the lamb or fused_adam optimizer will error out.")

HAVE_APEX_DISTRIBUTED_ADAM = False
if HAVE_APEX:
1 change: 0 additions & 1 deletion requirements/requirements_asr.txt
@@ -1,7 +1,6 @@
braceexpand
editdistance
g2p_en
-inflect
ipywidgets
jiwer
kaldi-python-io
2 changes: 2 additions & 0 deletions requirements/requirements_common.txt
@@ -1,4 +1,6 @@
+inflect
pandas
+pydantic<2 # remove after inflect supports Pydantic 2.0+
sacremoses>=0.0.43
sentencepiece<1.0.0
youtokentome>=1.0.5
1 change: 0 additions & 1 deletion requirements/requirements_nlp.txt
@@ -7,7 +7,6 @@ ftfy
gdown
h5py
ijson
-inflect
jieba
markdown2
matplotlib>=3.3.2
1 change: 0 additions & 1 deletion requirements/requirements_tts.txt
@@ -1,6 +1,5 @@
attrdict
einops
-inflect
jieba
kornia
librosa
8 changes: 3 additions & 5 deletions tests/collections/common/pl_utils.py
@@ -90,7 +90,7 @@ def _class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instanciate lightning metric
-metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
+metric = metric_class(dist_sync_on_step=dist_sync_on_step, **metric_args)

# verify metrics work after being loaded from pickled state
pickled_metric = pickle.dumps(metric)
@@ -303,7 +303,7 @@ def _perplexity_class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instanciate lightning metric
-perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
+perplexity = Perplexity(dist_sync_on_step=dist_sync_on_step, **metric_args)
if (probs is None) == (logits is None):
with pytest.raises(ValueError):
perplexity(probs, logits)
@@ -464,9 +464,7 @@ def _loss_class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instantiate lightning metric
-loss_metric = GlobalAverageLossMetric(
-compute_on_step=True, dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss
-)
+loss_metric = GlobalAverageLossMetric(dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss)

# verify loss works after being loaded from pickled state
pickled_metric = pickle.dumps(loss_metric)