From d270067248f5ef91ec19eeba99000c2fa7438b70 Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar
Date: Tue, 28 May 2024 04:16:31 -0700
Subject: [PATCH 1/4] move pooler under post_process

Signed-off-by: Dmytro Pykhtar
---
 .../language_modeling/megatron/bert/bert_model.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
index 67a4802d83f6..0d9bea25b5a6 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
@@ -345,11 +345,6 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
             transformer_block_type=self.transformer_block_type,
         )

-        if self.add_pooler:
-            self.pooler = Pooler(
-                self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
-            )
-
         # Output
         if self.post_process:
             # TODO: Make sure you are passing in the mpu_vocab_size properly
@@ -377,6 +372,11 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
                     self.config.hidden_size, 2, self.config.init_method, self.config.perform_initialization
                 )

+            if self.add_pooler:
+                self.pooler = Pooler(
+                    self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
+                )
+
     def forward(
         self,
         input_ids: Tensor,
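With pipeline parallelism, only the last pipeline stage runs with post_process=True, so output-side modules such as the pooler should be constructed only on that stage; building the pooler unconditionally makes intermediate stages allocate parameters they never use and lets checkpoint layouts diverge across pipeline ranks. Below is a minimal, self-contained sketch of the gating pattern that patches 1 and 2 apply; the class name and the Linear stand-in for Megatron's Pooler are illustrative, not NeMo's actual code.

    import torch

    class BertStageSketch(torch.nn.Module):
        """Hypothetical stand-in for one pipeline stage of the BERT model."""

        def __init__(self, hidden_size: int, post_process: bool, add_pooler: bool = True):
            super().__init__()
            self.post_process = post_process
            self.pooler = None
            if self.post_process and add_pooler:
                # Stand-in for Pooler(hidden_size, init_method, config, sequence_parallel):
                # only the last stage owns these weights.
                self.pooler = torch.nn.Linear(hidden_size, hidden_size)

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            # hidden_states: [batch, seq, hidden]; pool the first ([CLS]) token.
            if self.post_process and self.pooler is not None:
                return torch.tanh(self.pooler(hidden_states[:, 0]))
            return hidden_states

Patch 2 below is a follow-up with the same intent: it relocates the Pooler construction so it sits inside the post_process branch next to the other output-side modules (LM head, output layer, binary head).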
From eeed847d23b9432742cf871252b2a969f98f9f3f Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar
Date: Tue, 28 May 2024 06:30:12 -0700
Subject: [PATCH 2/4] move pooler under post_process

Signed-off-by: Dmytro Pykhtar
---
 .../language_modeling/megatron/bert/bert_model.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
index 0d9bea25b5a6..2ff6a2ae0a85 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
@@ -365,6 +365,11 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
                 skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights,
             )

+            if self.add_pooler:
+                self.pooler = Pooler(
+                    self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
+                )
+
             self.binary_head = None
             if self.add_binary_head:
                 # TODO: Shoudl switch this to TE ?
@@ -372,11 +377,6 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
                     self.config.hidden_size, 2, self.config.init_method, self.config.perform_initialization
                 )

-            if self.add_pooler:
-                self.pooler = Pooler(
-                    self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
-                )
-
     def forward(
         self,
         input_ids: Tensor,

From 7fae9822fde07fd89db3052d2cee36a72b871de8 Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar
Date: Tue, 28 May 2024 06:31:49 -0700
Subject: [PATCH 3/4] change pp size to 2 for bert pp test

Signed-off-by: Dmytro Pykhtar
---
 .github/workflows/cicd-main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 75188e38f68c..04626e3dfd34 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -2798,7 +2798,7 @@ jobs:
           model.megatron_amp_O2=True \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
-          model.tensor_model_parallel_size=2 \
+          model.pipeline_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
           model.optim.sched.warmup_steps=2 \
@@ -2829,7 +2829,7 @@ jobs:
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           exp_manager.resume_if_exists=True \
-          model.tensor_model_parallel_size=2 \
+          model.pipeline_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
           model.optim.sched.warmup_steps=2 \

From 56bb135560933a5d0611ae6b1249f71a3c6e18aa Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar
Date: Tue, 28 May 2024 07:48:21 -0700
Subject: [PATCH 4/4] change precision for gpt mock data generation test

Signed-off-by: Dmytro Pykhtar
---
 .github/workflows/cicd-main.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 04626e3dfd34..83ef8a8b4339 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -5170,8 +5170,10 @@ jobs:
           trainer.max_steps=10 \
           trainer.limit_val_batches=7 \
           trainer.val_check_interval=10 \
+          trainer.precision=16 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.mcore_gpt=True \
+          model.megatron_amp_O2=False \
           model.data.data_impl=mock \
           model.data.data_prefix=[]
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
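Patch 3 makes the BERT CI job actually exercise this code path by switching the 2-GPU test from tensor parallelism to pipeline parallelism (model.pipeline_model_parallel_size=2). A short sketch, assuming the usual Megatron-style rank bookkeeping (the helper name is made up for illustration), of why a pipeline size of 2 matters for the post_process guard:

    def stage_flags(pipeline_rank: int, pipeline_size: int) -> tuple[bool, bool]:
        # Megatron convention: the first stage pre-processes (embeddings),
        # the last stage post-processes (pooler, LM head, binary head).
        pre_process = pipeline_rank == 0
        post_process = pipeline_rank == pipeline_size - 1
        return pre_process, post_process

    # With model.pipeline_model_parallel_size=2, as in the CI override:
    for rank in range(2):
        print(rank, stage_flags(rank, 2))
    # 0 (True, False)  -> embeddings only; no pooler on this rank
    # 1 (False, True)  -> pooler and output heads live here

Patch 4 runs the GPT mock-data test in 16-bit precision with megatron_amp_O2 turned off; with trainer.precision=16 and model.megatron_amp_O2=False the test uses the framework's native mixed precision rather than Megatron's O2-style master-parameter AMP, which is why the two overrides are changed together.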