diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b64f6901dc47..d67bf4c6d381 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3060,13 +3060,13 @@ jobs: AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_ir/working_dir - L2_Megatron_GPT_PEFT_Lora_PP2: + L2_Megatron_GPT_PEFT_Lora_PP2_O2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: RUNNER: self-hosted-azure SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ @@ -3075,11 +3075,12 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + trainer.precision=bf16 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.megatron_amp_O2=True \ model.peft.peft_scheme=lora \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3090,10 +3091,28 @@ jobs: model.data.validation_ds.num_workers=0 \ model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + trainer.devices=2 \ + model.megatron_amp_O2=True \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + rm -rf /home/TestData/nlp/lora_tuning_pp2 - L2_Megatron_GPT_PEFT_Lora_TP2: + L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -3108,11 +3127,11 @@ jobs: trainer.max_steps=3 \ trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ + trainer.precision=bf16 \ exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.peft_scheme='lora' \ model.answer_only_loss=True \ model.micro_batch_size=1 \ @@ -3125,7 +3144,7 @@ jobs: model.data.validation_ds.names=[quarel] python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ model.tensor_model_parallel_size=2 \ trainer.devices=2 \ @@ -4234,8 +4253,8 @@ jobs: - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 - L2_Megatron_GPT_Embedding - - L2_Megatron_GPT_PEFT_Lora_PP2 - - L2_Megatron_GPT_PEFT_Lora_TP2 + - L2_Megatron_GPT_PEFT_Lora_PP2_O2 + - L2_Megatron_GPT_PEFT_Lora_TP2_O1 - L2_Megatron_GPT_Eval - L2_Megatron_GPT_Eval_PP2 - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 9983aba84b56..7d294f6085bb 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -109,11 +109,11 @@ def _get_all_keys( """ Returns all the keys in the model """ - k = [n for n, p in self._unwrap_model().named_parameters()] + k = [n for n, p in self._unwrap_model().named_parameters(prefix="model")] b = [ n - for n, p in self._unwrap_model().named_buffers() - if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict().keys() + for n, p in self._unwrap_model().named_buffers(prefix="model") + if n.replace("model.module.", "model.", 1) in self._unwrap_model().state_dict(prefix="model.").keys() ] # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use. return set(k + b) @@ -292,13 +292,13 @@ def setup_optimizer_param_groups(self): self.freeze(training=True) # Freeze the entire model if not self.ptuning_only_and_non_first_stage: opt_params = [] - for _, module in self._unwrap_model().named_modules(): + for _, module in self._unwrap_model().named_modules(prefix="model"): if isinstance(module, AdapterModuleMixin) and module.is_adapter_available(): module.set_enabled_adapters(enabled=True) module.unfreeze_enabled_adapters() # selectively unfreeze the adapter modules. opt_params += [p for p in module.parameters() if p.requires_grad] - for name, param in self._unwrap_model().named_parameters(): + for name, param in self._unwrap_model().named_parameters(prefix="model"): if name in self.tunable_base_param_keys: param.requires_grad = True opt_params += [param] @@ -397,11 +397,11 @@ def get_peft_state_dict(self): """ Gets the keys associated with the adapters only. """ - state_dict = self._unwrap_model().state_dict() + state_dict = self._unwrap_model().state_dict(prefix="model.") peft_state_dict = {} for k in self.adapter_keys.union(self.tunable_base_param_keys): # state_dict keys needs to be in non-O2 format and will be corrected in PEFTSaveRestoreConnector if O2=True - new_k = k.replace("module.", "", 1) + new_k = k.replace("model.module.", "model.", 1) peft_state_dict[new_k] = state_dict[new_k] return peft_state_dict