diff --git a/Dockerfile b/Dockerfile
index 0c3d56a0a29d..e8402189a474 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,8 +44,9 @@ RUN apt-get update && \
 WORKDIR /workspace/
 
 # Install Megatron-core
-RUN git clone https://github.com/aklife97/Megatron-LM.git && \
+RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
     cd Megatron-LM && \
+    git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
     pip install -e .
 
 WORKDIR /tmp/
diff --git a/Jenkinsfile b/Jenkinsfile
index 3e4895715df4..54cce564b660 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -60,8 +60,9 @@ pipeline {
     // TODO: remove when pip package is available
     stage('Megatron Core installation') {
       steps {
-        sh 'git clone https://github.com/aklife97/Megatron-LM.git && \
+        sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
             cd Megatron-LM && \
+            git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \
             pip install -e .'
       }
     }
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index 90053f3052c8..a7a22bb18150 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -311,6 +311,7 @@ def training_step(self, dataloader_iter, batch_idx):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         if losses_reduced_per_micro_batch:
@@ -411,6 +412,7 @@ def validation_step(self, dataloader_iter, batch_idx):
             tensor_shape=tensor_shape,
             dtype=self.autocast_dtype,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         if losses_reduced_per_micro_batch:
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py
index 7fc48856453f..c4cfcfdad1ff 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py
@@ -300,6 +300,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index d4132c4c7e80..d8f90c500182 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -375,6 +375,7 @@ def training_step(self, dataloader_iter, batch_idx):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
@@ -656,6 +657,7 @@ def validation_step(self, dataloader_iter, batch_idx):
             tensor_shape=tensor_shape,
             dtype=self.autocast_dtype,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stage of the pipeline returns losses
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
index 331136c64a46..49cb078cd462 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
@@ -309,6 +309,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
index 94b2d348a61d..4f4bc0d709a8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
@@ -328,6 +328,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
             decoder_seq_length=self.max_decoder_seq_length,
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
@@ -991,6 +992,7 @@ def dummy():
                 num_microbatches=1,
                 decoder_seq_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                enable_autocast=True,
             )
 
         if output_tensor:
@@ -1154,6 +1156,7 @@ def dummy():
                 num_microbatches=1,
                 decoder_seq_length=encoder_seq_length,
                 dtype=self.autocast_dtype,
+                enable_autocast=True,
             )
         # get output tensor
         if parallel_state.is_pipeline_last_stage():
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
index b3c08dff7ae8..4fce103ebc3b 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
@@ -197,6 +197,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
index efa059419eda..ff1888c1c9ea 100644
--- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
+++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
@@ -316,6 +316,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
             dtype=self.autocast_dtype,
             grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
             sequence_parallel=self.cfg.get('sequence_parallel', False),
+            enable_autocast=True,
         )
 
         # only the last stages of the pipeline return losses
diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py
index a2e7f351ae09..07607d3840d8 100644
--- a/nemo/collections/nlp/modules/common/text_generation_strategy.py
+++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py
@@ -62,6 +62,7 @@ def forward_step(self, batch, tensor_shape):
             forward_only=True,
             tensor_shape=tensor_shape,
             dtype=self.model.autocast_dtype,
+            enable_autocast=True,
         )
 
         return output_tensor