From e5706a0e70be2cb6abfd5698ab5f7be40860127d Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 30 Jun 2023 11:35:57 -0600 Subject: [PATCH 1/4] upgrade base container (#6938) Signed-off-by: ericharper --- Dockerfile | 4 ++-- Jenkinsfile | 2 +- README.rst | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7722555357b2..3aa4c39d6a4d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.06-py3 # build an image that includes only the nemo dependencies, ensures that dependencies # are included first for optimal caching, and useful for building a development @@ -94,7 +94,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.19.0 +ARG NEMO_VERSION=1.20.0 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/Jenkinsfile b/Jenkinsfile index 1a79d87bcd38..be62291daf24 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:23.04-py3' + image 'nvcr.io/nvidia/pytorch:23.06-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1' } } diff --git a/README.rst b/README.rst index 8a788da71550..7ac95b8cef70 100644 --- a/README.rst +++ b/README.rst @@ -319,13 +319,13 @@ To build a nemo container with Dockerfile from a branch, please run DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . -If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub. +If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub. .. 
code-block:: bash docker run --gpus all -it --rm -v :/NeMo --shm-size=8g \ -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ - stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3 + stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3 Examples -------- From b0e5bf3627dbcfb3f4a72d73d3c5e92184d8b1f6 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 30 Jun 2023 17:32:52 -0700 Subject: [PATCH 2/4] Fix requirements for pydantic + inflect (#6956) * Fix requirements for pydantic + inflect Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- requirements/requirements_asr.txt | 1 - requirements/requirements_common.txt | 2 ++ requirements/requirements_nlp.txt | 1 - requirements/requirements_tts.txt | 1 - 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index fdeaeb2d450d..011862ad723b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -1,7 +1,6 @@ braceexpand editdistance g2p_en -inflect ipywidgets jiwer kaldi-python-io diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt index 29d8ac4dd49b..a4d343a32d1a 100644 --- a/requirements/requirements_common.txt +++ b/requirements/requirements_common.txt @@ -1,4 +1,6 @@ +inflect pandas +pydantic<2 # remove after inflect supports Pydantic 2.0+ sacremoses>=0.0.43 sentencepiece<1.0.0 youtokentome>=1.0.5 diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 2018de6fbc31..68d8b8985748 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -7,7 +7,6 @@ ftfy gdown h5py ijson -inflect jieba markdown2 matplotlib>=3.3.2 diff --git a/requirements/requirements_tts.txt b/requirements/requirements_tts.txt index 20484871ee4b..bb330aaf2e58 100644 --- a/requirements/requirements_tts.txt +++ b/requirements/requirements_tts.txt @@ -1,6 +1,5 @@ attrdict einops -inflect jieba kornia librosa From 0b6e4e61bd23cbf9704dac431756d491adab084d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:01:35 -0700 Subject: [PATCH 3/4] Update distopt API for coalesced NCCL calls (#6886) * Update distopt API for coalesced NCCL calls Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update comment Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 6 +++--- nemo/core/optim/distributed_adam.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3aa4c39d6a4d..2e6b617087bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,11 +45,11 @@ RUN apt-get update && \ WORKDIR /workspace/ WORKDIR /tmp/ -# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch +# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch # container RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 && \ + git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \ pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" 
--global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ # uninstall stuff from base container @@ -75,7 +75,7 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec # install flash attention dependencies RUN pip install flash-attn # pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 -RUN pip install triton==2.0.0.dev20221202 +RUN pip install triton==2.0.0.dev20221202 # install k2, skip if installation fails COPY scripts /tmp/nemo/scripts/ diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 1f2ce90f3ff7..8c3b0a30658f 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -19,6 +19,7 @@ from apex.contrib.optimizers.distributed_fused_adam import ( DistributedFusedAdam, _coalescing_manager, + _coalescing_manager_append_work, _disable_pre_forward_hook, ) from megatron.core import parallel_state @@ -173,16 +174,15 @@ def _fp32_optim_grad_sync(self): for model_param, main_param in self._fp32_optim_main_params.items(): if model_param.grad is not None: main_param.grad += model_param.grad.detach() - sync_requests = [] - with _coalescing_manager(self.process_group, self.device, sync_requests): + with _coalescing_manager(self.process_group, self.device, async_ops=True) as cm: for main_param in self._fp32_optim_main_params.values(): - sync_requests.append( + _coalescing_manager_append_work( + cm, torch.distributed.all_reduce( main_param.grad, op=torch.distributed.ReduceOp.AVG, group=self.process_group, async_op=True, - ) + ), ) - for req in sync_requests: - req.wait() + cm.wait() self._fp32_optim_grad_sync_needed = False def zero_grad(self, *args, **kwargs): From 17447184bdf026b2f88d81353998856170bc09bc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 14:13:13 -0700 Subject: [PATCH 4/4] Remove `compute_on_step` from metrics (#6979) (#6981) * Remove `compute_on_step` from metrics * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove confusing log message * Update tests --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/metrics/rnnt_wer.py | 2 +- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 2 +- nemo/collections/asr/metrics/wer.py | 2 +- nemo/collections/asr/metrics/wer_bpe.py | 2 +- .../common/metrics/global_average_loss_metric.py | 9 ++------- nemo/collections/common/metrics/perplexity.py | 8 ++------ nemo/collections/nlp/metrics/sequence_perplexity.py | 9 ++------- .../nlp/models/language_modeling/bert_lm_model.py | 2 +- .../nlp/models/text2sparql/text2sparql_model.py | 2 +- nemo/core/optim/optimizers.py | 1 - tests/collections/common/pl_utils.py | 8 +++----- 11 files changed, 15 insertions(+), 32 deletions(-) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 55f9f4b5ea9f..7e5636191a1d 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -1224,7 +1224,7 @@ def validation_epoch_end(self, outputs): def __init__( self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False ): - super(RNNTWER, 
self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index 0870eb180776..d2e2c3cc5923 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -359,7 +359,7 @@ def __init__( log_prediction: bool = True, dist_sync_on_step=False, ): - super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7f7f853d307d..4d90810cc3df 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -1125,7 +1125,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 762acf172a16..8a92e4745a1b 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -247,7 +247,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.tokenizer = self.decoding.tokenizer self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size diff --git a/nemo/collections/common/metrics/global_average_loss_metric.py b/nemo/collections/common/metrics/global_average_loss_metric.py index fae1dbfea5e8..3bbd4d13abf4 100644 --- a/nemo/collections/common/metrics/global_average_loss_metric.py +++ b/nemo/collections/common/metrics/global_average_loss_metric.py @@ -28,9 +28,6 @@ class GlobalAverageLossMetric(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instruction. Args: - compute_on_step: - The method :meth:`forward` only calls ``update()`` and returns ``None`` if this is set to ``False``. 
- default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each method :meth:`forward` call before returning the value at the step @@ -44,10 +41,8 @@ class GlobalAverageLossMetric(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, take_avg_loss=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') self.take_avg_loss = take_avg_loss diff --git a/nemo/collections/common/metrics/perplexity.py b/nemo/collections/common/metrics/perplexity.py index 1158e3408611..9e1c21737ec8 100644 --- a/nemo/collections/common/metrics/perplexity.py +++ b/nemo/collections/common/metrics/perplexity.py @@ -29,8 +29,6 @@ class Perplexity(Metric): See `PyTorch Lightning Metrics `_ for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. @@ -44,10 +42,8 @@ class Perplexity(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, validate_args=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.validate_args = validate_args self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') # Total number of distributions seen since last reset diff --git a/nemo/collections/nlp/metrics/sequence_perplexity.py b/nemo/collections/nlp/metrics/sequence_perplexity.py index 688f9db87ea6..339f062f7cc1 100644 --- a/nemo/collections/nlp/metrics/sequence_perplexity.py +++ b/nemo/collections/nlp/metrics/sequence_perplexity.py @@ -31,8 +31,6 @@ class SequencePerplexity(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. process_group: @@ -43,12 +41,9 @@ class SequencePerplexity(Metric): to perform the allgather. 
""" - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): + def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, + dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn, ) # Total sum of exponentiated average negative log likelihoods diff --git a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py index 4c9d43c20d54..5cf509e77846 100644 --- a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py @@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # create extra bias # setup to track metrics - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py index 5290209b0c95..50046aef0344 100644 --- a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py +++ b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py @@ -100,7 +100,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): decoder=cfg.language_model.pretrained_decoder_model_name, ) - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/core/optim/optimizers.py b/nemo/core/optim/optimizers.py index 76e47e20e0cc..9473ef0af969 100644 --- a/nemo/core/optim/optimizers.py +++ b/nemo/core/optim/optimizers.py @@ -51,7 +51,6 @@ AVAILABLE_OPTIMIZERS['fused_adam'] = FusedAdam except ModuleNotFoundError: HAVE_APEX = False - logging.warning("Apex was not found. 
Using the lamb or fused_adam optimizer will error out.") HAVE_APEX_DISTRIBUTED_ADAM = False if HAVE_APEX: diff --git a/tests/collections/common/pl_utils.py b/tests/collections/common/pl_utils.py index 395c8cef5969..a2e9609c8492 100644 --- a/tests/collections/common/pl_utils.py +++ b/tests/collections/common/pl_utils.py @@ -90,7 +90,7 @@ def _class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + metric = metric_class(dist_sync_on_step=dist_sync_on_step, **metric_args) # verify metrics work after being loaded from pickled state pickled_metric = pickle.dumps(metric) @@ -303,7 +303,7 @@ def _perplexity_class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + perplexity = Perplexity(dist_sync_on_step=dist_sync_on_step, **metric_args) if (probs is None) == (logits is None): with pytest.raises(ValueError): perplexity(probs, logits) @@ -464,9 +464,7 @@ def _loss_class_test( calculated across devices for each batch (and not just at the end) """ # Instantiate lightning metric - loss_metric = GlobalAverageLossMetric( - compute_on_step=True, dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss - ) + loss_metric = GlobalAverageLossMetric(dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss) # verify loss works after being loaded from pickled state pickled_metric = pickle.dumps(loss_metric)
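
Note on PATCH 4/4: with `compute_on_step` removed, `forward()` always updates the metric state and returns the value for the current batch, while `compute()` aggregates everything accumulated since the last `reset()`. A minimal caller-side sketch of the new construction, assuming `Perplexity` is exported from `nemo.collections.common.metrics` and that its `update()` accepts a `logits` keyword, as the test changes above suggest (shapes and values below are illustrative only):

    import torch

    from nemo.collections.common.metrics import Perplexity

    # `compute_on_step` is no longer accepted at construction time.
    perplexity = Perplexity(dist_sync_on_step=False)

    logits = torch.randn(2, 8, 100)        # (batch, seq_len, vocab); illustrative shapes
    batch_ppl = perplexity(logits=logits)  # forward(): updates state, returns per-batch value

    epoch_ppl = perplexity.compute()       # aggregated over all updates since reset()
    perplexity.reset()

The same pattern applies to `GlobalAverageLossMetric` and the WER/RNNT-WER metrics touched above: drop the keyword at construction and keep the existing `update()`/`compute()` flow unchanged.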