From ea728c2a7b5d04391fb918e30737448601082756 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Fri, 14 Jun 2024 16:23:27 -0700 Subject: [PATCH 1/5] fix unwrap model Signed-off-by: Chen Cui --- .../nlp/parts/mixins/nlp_adapter_mixins.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 9983aba84b56..7d294f6085bb 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -109,11 +109,11 @@ def _get_all_keys( """ Returns all the keys in the model """ - k = [n for n, p in self._unwrap_model().named_parameters()] + k = [n for n, p in self._unwrap_model().named_parameters(prefix="model")] b = [ n - for n, p in self._unwrap_model().named_buffers() - if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict().keys() + for n, p in self._unwrap_model().named_buffers(prefix="model") + if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict(prefix="model.").keys() ] # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use. return set(k + b) @@ -292,13 +292,13 @@ def setup_optimizer_param_groups(self): self.freeze(training=True) # Freeze the entire model if not self.ptuning_only_and_non_first_stage: opt_params = [] - for _, module in self._unwrap_model().named_modules(): + for _, module in self._unwrap_model().named_modules(prefix="model"): if isinstance(module, AdapterModuleMixin) and module.is_adapter_available(): module.set_enabled_adapters(enabled=True) module.unfreeze_enabled_adapters() # selectively unfreeze the adapter modules. opt_params += [p for p in module.parameters() if p.requires_grad] - for name, param in self._unwrap_model().named_parameters(): + for name, param in self._unwrap_model().named_parameters(prefix="model"): if name in self.tunable_base_param_keys: param.requires_grad = True opt_params += [param] @@ -397,11 +397,11 @@ def get_peft_state_dict(self): """ Gets the keys associated with the adapters only. 
""" - state_dict = self._unwrap_model().state_dict() + state_dict = self._unwrap_model().state_dict(prefix="model.") peft_state_dict = {} for k in self.adapter_keys.union(self.tunable_base_param_keys): # state_dict keys needs to be in non-O2 format and will be corrected in PEFTSaveRestoreConnector if O2=True - new_k = k.replace("module.", "", 1) + new_k = k.replace("model.module.", "model.", 1) peft_state_dict[new_k] = state_dict[new_k] return peft_state_dict From 67b81b81ce70f928e806433e17145d26a5b54abc Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Sat, 15 Jun 2024 09:10:09 -0700 Subject: [PATCH 2/5] add O2 to ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 37 +++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b64f6901dc47..e0b537097c12 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3060,13 +3060,13 @@ jobs: AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_ir/working_dir - L2_Megatron_GPT_PEFT_Lora_PP2: + L2_Megatron_GPT_PEFT_Lora_PP2_O2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: RUNNER: self-hosted-azure SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ @@ -3076,10 +3076,11 @@ jobs: trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.megatron_amp_O2=True \ model.peft.peft_scheme=lora \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3090,10 +3091,28 @@ jobs: model.data.validation_ds.num_workers=0 \ model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.megatron_amp_O2=True \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'" + sh "rm -rf /home/TestData/nlp/lora_tuning_pp2 AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - L2_Megatron_GPT_PEFT_Lora_TP2: + L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -3112,7 +3131,7 @@ jobs: exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ 
model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.peft_scheme='lora' \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3125,7 +3144,7 @@ jobs: model.data.validation_ds.names=[quarel] python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ model.tensor_model_parallel_size=2 \ trainer.devices=2 \ @@ -4234,8 +4253,8 @@ jobs: - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 - L2_Megatron_GPT_Embedding - - L2_Megatron_GPT_PEFT_Lora_PP2 - - L2_Megatron_GPT_PEFT_Lora_TP2 + - L2_Megatron_GPT_PEFT_Lora_PP2_O2 + - L2_Megatron_GPT_PEFT_Lora_TP2_O1 - L2_Megatron_GPT_Eval - L2_Megatron_GPT_Eval_PP2 - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len From 2bb960524b0faf977510d950c024cab71e84a604 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Sat, 15 Jun 2024 10:52:23 -0700 Subject: [PATCH 3/5] fix ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e0b537097c12..26850a65aa23 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3075,7 +3075,7 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ + trainer.precision=bf16 \ exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ @@ -3127,7 +3127,7 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ + trainer.precision=bf16 \ exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ From 57d311790cc91741d51210ee82ecbb0dab7fa7f6 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 17 Jun 2024 10:21:41 -0700 Subject: [PATCH 4/5] fix ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 26850a65aa23..87204bd20101 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3107,10 +3107,9 @@ jobs: model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/lora_tuning_pp2 + inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] From 5d27d593039dbe1ba19b58bbc4675911949bcbaf Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 17 Jun 2024 15:14:17 -0700 Subject: [PATCH 5/5] fix ci test Signed-off-by: Chen Cui --- 
.github/workflows/cicd-main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 87204bd20101..d67bf4c6d381 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3095,7 +3095,8 @@ jobs: python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ trainer.devices=2 \ model.megatron_amp_O2=True \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
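
Editor's note on the key-prefix handling changed in PATCH 1/5: the mixin now calls named_parameters(prefix="model"), named_buffers(prefix="model"), named_modules(prefix="model") and state_dict(prefix="model."), so parameter, buffer and state-dict keys all carry the same leading "model." segment, and the PEFT state-dict lookup strips the extra "module." segment that appears when megatron_amp_O2 wraps the base model under a .module attribute ("model.module." -> "model."). The sketch below only illustrates how those PyTorch prefix arguments and the key normalization interact; TinyModel and O2Wrapper are stand-ins invented for this example, not the actual NeMo or Megatron classes, and the real wrapper does more than hold a .module reference.

    import torch.nn as nn

    class TinyModel(nn.Module):
        # Minimal base model; the real model is a Megatron GPT module.
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)

    class O2Wrapper(nn.Module):
        # Mimics only the nesting behaviour of an O2 (master-weights) wrapper:
        # it exposes the wrapped model under a `.module` attribute.
        def __init__(self, module):
            super().__init__()
            self.module = module

    base = TinyModel()
    wrapped = O2Wrapper(base)

    # With prefix="model", names line up with the "model."-prefixed keys used above.
    o1_keys = [n for n, _ in base.named_parameters(prefix="model")]
    o2_keys = [n for n, _ in wrapped.named_parameters(prefix="model")]
    print(o1_keys)  # ['model.proj.weight', 'model.proj.bias']
    print(o2_keys)  # ['model.module.proj.weight', 'model.module.proj.bias']

    # Normalize O2-style keys back to the non-O2 form, as the patched code does.
    normalized = [k.replace("model.module.", "model.", 1) for k in o2_keys]
    assert normalized == o1_keys

The replacement of the earlier 'k.replace("module.", "", 1)' with 'k.replace("model.module.", "model.", 1)' appears intended to keep the leading "model." segment intact, so that the normalized key still matches an entry in the prefixed state dict whether or not O2 is enabled; the anchored first-occurrence replace also avoids touching any later "module." substrings inside a parameter name.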