From ea728c2a7b5d04391fb918e30737448601082756 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Fri, 14 Jun 2024 16:23:27 -0700 Subject: [PATCH 1/5] fix unwrap model Signed-off-by: Chen Cui --- .../nlp/parts/mixins/nlp_adapter_mixins.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 9983aba84b56..7d294f6085bb 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -109,11 +109,11 @@ def _get_all_keys( """ Returns all the keys in the model """ - k = [n for n, p in self._unwrap_model().named_parameters()] + k = [n for n, p in self._unwrap_model().named_parameters(prefix="model")] b = [ n - for n, p in self._unwrap_model().named_buffers() - if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict().keys() + for n, p in self._unwrap_model().named_buffers(prefix="model") + if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict(prefix="model.").keys() ] # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use. return set(k + b) @@ -292,13 +292,13 @@ def setup_optimizer_param_groups(self): self.freeze(training=True) # Freeze the entire model if not self.ptuning_only_and_non_first_stage: opt_params = [] - for _, module in self._unwrap_model().named_modules(): + for _, module in self._unwrap_model().named_modules(prefix="model"): if isinstance(module, AdapterModuleMixin) and module.is_adapter_available(): module.set_enabled_adapters(enabled=True) module.unfreeze_enabled_adapters() # selectively unfreeze the adapter modules. opt_params += [p for p in module.parameters() if p.requires_grad] - for name, param in self._unwrap_model().named_parameters(): + for name, param in self._unwrap_model().named_parameters(prefix="model"): if name in self.tunable_base_param_keys: param.requires_grad = True opt_params += [param] @@ -397,11 +397,11 @@ def get_peft_state_dict(self): """ Gets the keys associated with the adapters only. 
""" - state_dict = self._unwrap_model().state_dict() + state_dict = self._unwrap_model().state_dict(prefix="model.") peft_state_dict = {} for k in self.adapter_keys.union(self.tunable_base_param_keys): # state_dict keys needs to be in non-O2 format and will be corrected in PEFTSaveRestoreConnector if O2=True - new_k = k.replace("module.", "", 1) + new_k = k.replace("model.module.", "model.", 1) peft_state_dict[new_k] = state_dict[new_k] return peft_state_dict From 67b81b81ce70f928e806433e17145d26a5b54abc Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Sat, 15 Jun 2024 09:10:09 -0700 Subject: [PATCH 2/5] add O2 to ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 37 +++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b64f6901dc47..e0b537097c12 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3060,13 +3060,13 @@ jobs: AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_ir/working_dir - L2_Megatron_GPT_PEFT_Lora_PP2: + L2_Megatron_GPT_PEFT_Lora_PP2_O2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: RUNNER: self-hosted-azure SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ @@ -3076,10 +3076,11 @@ jobs: trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.megatron_amp_O2=True \ model.peft.peft_scheme=lora \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3090,10 +3091,28 @@ jobs: model.data.validation_ds.num_workers=0 \ model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.megatron_amp_O2=True \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'" + sh "rm -rf /home/TestData/nlp/lora_tuning_pp2 AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - L2_Megatron_GPT_PEFT_Lora_TP2: + L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -3112,7 +3131,7 @@ jobs: exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ 
model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.peft_scheme='lora' \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3125,7 +3144,7 @@ jobs: model.data.validation_ds.names=[quarel] python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ model.tensor_model_parallel_size=2 \ trainer.devices=2 \ @@ -4234,8 +4253,8 @@ jobs: - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 - L2_Megatron_GPT_Embedding - - L2_Megatron_GPT_PEFT_Lora_PP2 - - L2_Megatron_GPT_PEFT_Lora_TP2 + - L2_Megatron_GPT_PEFT_Lora_PP2_O2 + - L2_Megatron_GPT_PEFT_Lora_TP2_O1 - L2_Megatron_GPT_Eval - L2_Megatron_GPT_Eval_PP2 - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len From 2bb960524b0faf977510d950c024cab71e84a604 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Sat, 15 Jun 2024 10:52:23 -0700 Subject: [PATCH 3/5] fix ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e0b537097c12..26850a65aa23 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3075,7 +3075,7 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ + trainer.precision=bf16 \ exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ @@ -3127,7 +3127,7 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ + trainer.precision=bf16 \ exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ From 57d311790cc91741d51210ee82ecbb0dab7fa7f6 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 17 Jun 2024 10:21:41 -0700 Subject: [PATCH 4/5] fix ci test Signed-off-by: Chen Cui --- .github/workflows/cicd-main.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 26850a65aa23..87204bd20101 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3107,10 +3107,9 @@ jobs: model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/lora_tuning_pp2 + inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] From 5d27d593039dbe1ba19b58bbc4675911949bcbaf Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 17 Jun 2024 15:14:17 -0700 Subject: [PATCH 5/5] fix ci test Signed-off-by: Chen Cui --- 
.github/workflows/cicd-main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 87204bd20101..d67bf4c6d381 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3095,7 +3095,8 @@ jobs: python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ trainer.devices=2 \ model.megatron_amp_O2=True \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
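
Editor's note on the key-prefix handling changed in PATCH 1/5: the mixin now calls named_parameters(prefix="model"), named_buffers(prefix="model"), named_modules(prefix="model") and state_dict(prefix="model."), so parameter, buffer and state-dict keys all carry the same leading "model." segment, and the PEFT state-dict lookup strips the extra "module." segment that appears when megatron_amp_O2 wraps the base model under a .module attribute ("model.module." -> "model."). The sketch below only illustrates how those PyTorch prefix arguments and the key normalization interact; TinyModel and O2Wrapper are stand-ins invented for this example, not the actual NeMo or Megatron classes, and the real wrapper does more than hold a .module reference.

    import torch.nn as nn

    class TinyModel(nn.Module):
        # Minimal base model; the real model is a Megatron GPT module.
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)

    class O2Wrapper(nn.Module):
        # Mimics only the nesting behaviour of an O2 (master-weights) wrapper:
        # it exposes the wrapped model under a `.module` attribute.
        def __init__(self, module):
            super().__init__()
            self.module = module

    base = TinyModel()
    wrapped = O2Wrapper(base)

    # With prefix="model", names line up with the "model."-prefixed keys used above.
    o1_keys = [n for n, _ in base.named_parameters(prefix="model")]
    o2_keys = [n for n, _ in wrapped.named_parameters(prefix="model")]
    print(o1_keys)  # ['model.proj.weight', 'model.proj.bias']
    print(o2_keys)  # ['model.module.proj.weight', 'model.module.proj.bias']

    # Normalize O2-style keys back to the non-O2 form, as the patched code does.
    normalized = [k.replace("model.module.", "model.", 1) for k in o2_keys]
    assert normalized == o1_keys

The replacement of the earlier 'k.replace("module.", "", 1)' with 'k.replace("model.module.", "model.", 1)' appears intended to keep the leading "model." segment intact, so that the normalized key still matches an entry in the prefixed state dict whether or not O2 is enabled; the anchored first-occurrence replace also avoids touching any later "module." substrings inside a parameter name.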