From e5706a0e70be2cb6abfd5698ab5f7be40860127d Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 30 Jun 2023 11:35:57 -0600 Subject: [PATCH 1/4] upgrade base container (#6938) Signed-off-by: ericharper --- Dockerfile | 4 ++-- Jenkinsfile | 2 +- README.rst | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7722555357b2..3aa4c39d6a4d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.06-py3 # build an image that includes only the nemo dependencies, ensures that dependencies # are included first for optimal caching, and useful for building a development @@ -94,7 +94,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.19.0 +ARG NEMO_VERSION=1.20.0 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/Jenkinsfile b/Jenkinsfile index 1a79d87bcd38..be62291daf24 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:23.04-py3' + image 'nvcr.io/nvidia/pytorch:23.06-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1' } } diff --git a/README.rst b/README.rst index 8a788da71550..7ac95b8cef70 100644 --- a/README.rst +++ b/README.rst @@ -319,13 +319,13 @@ To build a nemo container with Dockerfile from a branch, please run DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . -If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub. +If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub. .. 
code-block:: bash docker run --gpus all -it --rm -v :/NeMo --shm-size=8g \ -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ - stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3 + stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3 Examples -------- From b0e5bf3627dbcfb3f4a72d73d3c5e92184d8b1f6 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 30 Jun 2023 17:32:52 -0700 Subject: [PATCH 2/4] Fix requirements for pydantic + inflect (#6956) * Fix requirements for pydantic + inflect Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- requirements/requirements_asr.txt | 1 - requirements/requirements_common.txt | 2 ++ requirements/requirements_nlp.txt | 1 - requirements/requirements_tts.txt | 1 - 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index fdeaeb2d450d..011862ad723b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -1,7 +1,6 @@ braceexpand editdistance g2p_en -inflect ipywidgets jiwer kaldi-python-io diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt index 29d8ac4dd49b..a4d343a32d1a 100644 --- a/requirements/requirements_common.txt +++ b/requirements/requirements_common.txt @@ -1,4 +1,6 @@ +inflect pandas +pydantic<2 # remove after inflect supports Pydantic 2.0+ sacremoses>=0.0.43 sentencepiece<1.0.0 youtokentome>=1.0.5 diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 2018de6fbc31..68d8b8985748 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -7,7 +7,6 @@ ftfy gdown h5py ijson -inflect jieba markdown2 matplotlib>=3.3.2 diff --git a/requirements/requirements_tts.txt b/requirements/requirements_tts.txt index 20484871ee4b..bb330aaf2e58 100644 --- a/requirements/requirements_tts.txt +++ b/requirements/requirements_tts.txt @@ -1,6 +1,5 @@ attrdict einops -inflect jieba kornia librosa From 0b6e4e61bd23cbf9704dac431756d491adab084d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:01:35 -0700 Subject: [PATCH 3/4] Update distopt API for coalesced NCCL calls (#6886) * Update distopt API for coalesced NCCL calls Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update comment Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 6 +++--- nemo/core/optim/distributed_adam.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3aa4c39d6a4d..2e6b617087bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,11 +45,11 @@ RUN apt-get update && \ WORKDIR /workspace/ WORKDIR /tmp/ -# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch +# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch # container RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 && \ + git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \ pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" 
--global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ # uninstall stuff from base container @@ -75,7 +75,7 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec # install flash attention dependencies RUN pip install flash-attn # pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 -RUN pip install triton==2.0.0.dev20221202 +RUN pip install triton==2.0.0.dev20221202 # install k2, skip if installation fails COPY scripts /tmp/nemo/scripts/ diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 1f2ce90f3ff7..8c3b0a30658f 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -19,6 +19,7 @@ from apex.contrib.optimizers.distributed_fused_adam import ( DistributedFusedAdam, _coalescing_manager, + _coalescing_manager_append_work, _disable_pre_forward_hook, ) from megatron.core import parallel_state @@ -173,16 +174,15 @@ def _fp32_optim_grad_sync(self): for model_param, main_param in self._fp32_optim_main_params.items(): if model_param.grad is not None: main_param.grad += model_param.grad.detach() - sync_requests = [] - with _coalescing_manager(self.process_group, self.device, sync_requests): + with _coalescing_manager(self.process_group, self.device, async_ops=True) as cm: for main_param in self._fp32_optim_main_params.values(): - sync_requests.append( + _coalescing_manager_append_work( + cm, torch.distributed.all_reduce( main_param.grad, op=torch.distributed.ReduceOp.AVG, group=self.process_group, async_op=True, - ) + ), ) - for req in sync_requests: - req.wait() + cm.wait() self._fp32_optim_grad_sync_needed = False def zero_grad(self, *args, **kwargs): From 17447184bdf026b2f88d81353998856170bc09bc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 14:13:13 -0700 Subject: [PATCH 4/4] Remove `compute_on_step` from metrics (#6979) (#6981) * Remove `compute_on_step` from metrics * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove confusing log message * Update tests --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/metrics/rnnt_wer.py | 2 +- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 2 +- nemo/collections/asr/metrics/wer.py | 2 +- nemo/collections/asr/metrics/wer_bpe.py | 2 +- .../common/metrics/global_average_loss_metric.py | 9 ++------- nemo/collections/common/metrics/perplexity.py | 8 ++------ nemo/collections/nlp/metrics/sequence_perplexity.py | 9 ++------- .../nlp/models/language_modeling/bert_lm_model.py | 2 +- .../nlp/models/text2sparql/text2sparql_model.py | 2 +- nemo/core/optim/optimizers.py | 1 - tests/collections/common/pl_utils.py | 8 +++----- 11 files changed, 15 insertions(+), 32 deletions(-) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 55f9f4b5ea9f..7e5636191a1d 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -1224,7 +1224,7 @@ def validation_epoch_end(self, outputs): def __init__( self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False ): - super(RNNTWER, 
self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index 0870eb180776..d2e2c3cc5923 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -359,7 +359,7 @@ def __init__( log_prediction: bool = True, dist_sync_on_step=False, ): - super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7f7f853d307d..4d90810cc3df 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -1125,7 +1125,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 762acf172a16..8a92e4745a1b 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -247,7 +247,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.tokenizer = self.decoding.tokenizer self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size diff --git a/nemo/collections/common/metrics/global_average_loss_metric.py b/nemo/collections/common/metrics/global_average_loss_metric.py index fae1dbfea5e8..3bbd4d13abf4 100644 --- a/nemo/collections/common/metrics/global_average_loss_metric.py +++ b/nemo/collections/common/metrics/global_average_loss_metric.py @@ -28,9 +28,6 @@ class GlobalAverageLossMetric(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instruction. Args: - compute_on_step: - The method :meth:`forward` only calls ``update()`` and returns ``None`` if this is set to ``False``. 
- default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each method :meth:`forward` call before returning the value at the step @@ -44,10 +41,8 @@ class GlobalAverageLossMetric(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, take_avg_loss=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') self.take_avg_loss = take_avg_loss diff --git a/nemo/collections/common/metrics/perplexity.py b/nemo/collections/common/metrics/perplexity.py index 1158e3408611..9e1c21737ec8 100644 --- a/nemo/collections/common/metrics/perplexity.py +++ b/nemo/collections/common/metrics/perplexity.py @@ -29,8 +29,6 @@ class Perplexity(Metric): See `PyTorch Lightning Metrics `_ for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. @@ -44,10 +42,8 @@ class Perplexity(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, validate_args=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.validate_args = validate_args self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') # Total number of distributions seen since last reset diff --git a/nemo/collections/nlp/metrics/sequence_perplexity.py b/nemo/collections/nlp/metrics/sequence_perplexity.py index 688f9db87ea6..339f062f7cc1 100644 --- a/nemo/collections/nlp/metrics/sequence_perplexity.py +++ b/nemo/collections/nlp/metrics/sequence_perplexity.py @@ -31,8 +31,6 @@ class SequencePerplexity(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. process_group: @@ -43,12 +41,9 @@ class SequencePerplexity(Metric): to perform the allgather. 
""" - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): + def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, + dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn, ) # Total sum of exponentiated average negative log likelihoods diff --git a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py index 4c9d43c20d54..5cf509e77846 100644 --- a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py @@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # create extra bias # setup to track metrics - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py index 5290209b0c95..50046aef0344 100644 --- a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py +++ b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py @@ -100,7 +100,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): decoder=cfg.language_model.pretrained_decoder_model_name, ) - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/core/optim/optimizers.py b/nemo/core/optim/optimizers.py index 76e47e20e0cc..9473ef0af969 100644 --- a/nemo/core/optim/optimizers.py +++ b/nemo/core/optim/optimizers.py @@ -51,7 +51,6 @@ AVAILABLE_OPTIMIZERS['fused_adam'] = FusedAdam except ModuleNotFoundError: HAVE_APEX = False - logging.warning("Apex was not found. 
Using the lamb or fused_adam optimizer will error out.") HAVE_APEX_DISTRIBUTED_ADAM = False if HAVE_APEX: diff --git a/tests/collections/common/pl_utils.py b/tests/collections/common/pl_utils.py index 395c8cef5969..a2e9609c8492 100644 --- a/tests/collections/common/pl_utils.py +++ b/tests/collections/common/pl_utils.py @@ -90,7 +90,7 @@ def _class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + metric = metric_class(dist_sync_on_step=dist_sync_on_step, **metric_args) # verify metrics work after being loaded from pickled state pickled_metric = pickle.dumps(metric) @@ -303,7 +303,7 @@ def _perplexity_class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + perplexity = Perplexity(dist_sync_on_step=dist_sync_on_step, **metric_args) if (probs is None) == (logits is None): with pytest.raises(ValueError): perplexity(probs, logits) @@ -464,9 +464,7 @@ def _loss_class_test( calculated across devices for each batch (and not just at the end) """ # Instantiate lightning metric - loss_metric = GlobalAverageLossMetric( - compute_on_step=True, dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss - ) + loss_metric = GlobalAverageLossMetric(dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss) # verify loss works after being loaded from pickled state pickled_metric = pickle.dumps(loss_metric)
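
Note on PATCH 4/4: with `compute_on_step` removed, `forward()` always updates the metric state and returns the value for the current batch, while `compute()` aggregates everything accumulated since the last `reset()`. A minimal caller-side sketch of the new construction, assuming `Perplexity` is exported from `nemo.collections.common.metrics` and that its `update()` accepts a `logits` keyword, as the test changes above suggest (shapes and values below are illustrative only):

    import torch

    from nemo.collections.common.metrics import Perplexity

    # `compute_on_step` is no longer accepted at construction time.
    perplexity = Perplexity(dist_sync_on_step=False)

    logits = torch.randn(2, 8, 100)        # (batch, seq_len, vocab); illustrative shapes
    batch_ppl = perplexity(logits=logits)  # forward(): updates state, returns per-batch value

    epoch_ppl = perplexity.compute()       # aggregated over all updates since reset()
    perplexity.reset()

The same pattern applies to `GlobalAverageLossMetric` and the WER/RNNT-WER metrics touched above: drop the keyword at construction and keep the existing `update()`/`compute()` flow unchanged.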