From eb5983a6087e7cd31bfcc5e6cbb267d381f82ec0 Mon Sep 17 00:00:00 2001
From: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Date: Tue, 25 Jul 2023 12:12:16 -0700
Subject: [PATCH] Disable distopt contiguous param buffer by default (#7095)

Signed-off-by: Tim Moon
---
 .../nlp/models/language_modeling/megatron_base_model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 3f541cfce14e..88281599ee82 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -66,7 +66,7 @@ class MegatronBaseModel(NLPModel):
 
     - Initialize the model parallel world for nemo.
     - Turn on all of the nvidia optimizations.
-    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
+    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
       correct size for tensor model parallelism.
     - If using distributed optimizer, configure to be compatible
       with O2 level optimizations and/or model parallelism.
@@ -405,9 +405,8 @@ def setup_optimization(
         optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()
         if self.with_distributed_adam:
 
-            # Allocate contiguous buffers to avoid extra copies
+            # Allocate contiguous buffer to avoid extra copies
             optim_kwargs['contiguous_grad_buffer'] = True
-            optim_kwargs['contiguous_param_buffer'] = True
 
             # Make sure optimizer state is in FP32
             optim_dtype = torch.float32
@@ -507,7 +506,8 @@ def configure_optimizers(self):
                     self._optimizer.init_params(reversed(no_overlap_params))
 
             # Initialize contiguous parameter buffer
-            self._optimizer.init_param_buffer()
+            if self._optimizer.contiguous_param_buffer:
+                self._optimizer.init_param_buffer()
 
         if self._scheduler is None:
             return self._optimizer
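
Note (not part of the patch): after this change `contiguous_param_buffer` is no longer forced on, so a contiguous parameter buffer has to be requested explicitly. The sketch below is illustrative only; `build_distopt_kwargs` is a hypothetical helper, and it assumes `setup_optimization` still copies and forwards `optim_kwargs` to the distributed Adam optimizer as shown in the @@ -405 hunk above.

from typing import Any, Dict

def build_distopt_kwargs(use_contiguous_param_buffer: bool = False) -> Dict[str, Any]:
    """Collect extra kwargs for the distributed Adam optimizer.

    Mirrors the post-patch defaults: the gradient buffer stays contiguous,
    while the parameter buffer is opt-in.
    """
    optim_kwargs: Dict[str, Any] = {'contiguous_grad_buffer': True}
    if use_contiguous_param_buffer:
        # Explicitly opt back in to the pre-patch behavior.
        optim_kwargs['contiguous_param_buffer'] = True
    return optim_kwargs

# Usage (illustrative): pass the kwargs through setup_optimization, which
# copies and forwards them to the optimizer.
# model.setup_optimization(optim_kwargs=build_distopt_kwargs(True))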