From 0316a97c8de3c14fed5b44abbac5e47887ad5ae2 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 22 Aug 2023 23:30:08 +0000 Subject: [PATCH 1/9] remove old prompt table for storing cached ptunig representations Signed-off-by: arendu --- .../megatron_gpt_peft_models.py | 30 +++++++++---------- .../megatron/adapters/parallel_adapters.py | 18 ++++++----- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 7a2fd88d0c49..c0c1373033dc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -329,11 +329,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight", # for Float16Model models + "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) - # we exclude the above parameter from training because it is present for backward compatibility for inference using FasterTransformer (@adithyare) def init_peft_modules(self,): """ @@ -430,24 +429,23 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): } super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens - - def setup_optimizer_param_groups(self): - super().setup_optimizer_param_groups() - - # (guyueh1) This part is used to avoid adding frozen parameters in trainable adapter modules - # in the setup_optimizer_param_groups() of the MegatronPEFTModel class, all parameters - # in an adapter module are going to be set requires_grad=True. However in ptuning - # adapter the inference table should be untrainable. We explicitely set that parameter - # to untrainable here. 
self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight", # for Float16Model or BFloat16Model models + "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) + + def setup_optimizer_param_groups(self): + self.freeze() # Freeze the entire model + opt_params = [] for n, p in self.named_parameters(): - if not (n in self.trainable_keys): - p.requires_grad_(False) + if n in self.trainable_keys: + p.requires_grad = True + opt_params.append(p) + + self._optimizer_param_groups = ({"params": opt_params},) + logging.info(f"Optimizer groups set:\n{self.summarize()}") class MegatronGPTLoRAModel(MegatronGPTLayerwisePEFTModel): diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index ff5b304fe4c3..8af84ea6f0a2 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -26,7 +26,6 @@ from nemo.collections.common.parts.utils import activation_registry from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, init_method_const, init_method_normal -from nemo.collections.nlp.modules.common.prompt_encoder import InferenceTable from nemo.core.classes.mixins import adapter_mixin_strategies try: @@ -322,7 +321,9 @@ def __init__( # (@adithyare) the persistent=False will not pollute the indices into the state_dict of this module. self.register_buffer("indices", torch.LongTensor(list(range(self.virtual_tokens))), persistent=False) self.embedding = torch.nn.Embedding(self.virtual_tokens, self.embedding_dim) - self.inference_table = InferenceTable("taskname", self.output_dim, self.virtual_tokens) + self.inference_table = nn.Embedding(self.virtual_tokens, self.output_dim) + self.inference_table.requires_grad = False + self.is_inference_ready = False self.first = ColumnParallelLinear( self.embedding_dim, self.bottleneck_dim, @@ -356,13 +357,16 @@ def set_inference_table(self, prompt_representation: torch.Tensor): This method caches the output representation from the Encoder and saves it inside `self.inference_table`. 
""" prompt_representation = prompt_representation.detach().clone() - self.inference_table.set_prompt_table(prompt_representation) + self.inference_table.weight.data = prompt_representation + self.is_inference_ready = True + return True def clear_inference_table(self,): - self.inference_table.clear_prompt_table() + self.is_inference_ready = False + self.inference_table.weight.data.fill_(0.0) def get_inference_table(self,): - return self.inference_table.get_prompt_table() + return self.inference_table.weight.data def inner_forward(self,): input_embeds = self.embedding(self.indices).unsqueeze(0) @@ -381,11 +385,11 @@ def forward(self, batch_size: int, use_cached_reps: bool = False) -> torch.Tenso output_embeds = self.get_inference_table().unsqueeze(1) else: if self.training: - if self.inference_table.is_inference_ready: + if self.is_inference_ready: self.clear_inference_table() output_embeds = self.inner_forward() else: - if not self.inference_table.is_inference_ready: + if not self.is_inference_ready: output_embeds = self.inner_forward() self.set_inference_table(output_embeds.squeeze(1)) output_embeds = self.get_inference_table().unsqueeze(1) From 45f0722c944aa20c13036ed44a0d50a826d31d77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Aug 2023 23:33:04 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_gpt_peft_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index c0c1373033dc..328ca3a76fea 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -330,7 +330,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.trainable_keys = self.adapter_keys - set( [ "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) @@ -432,7 +432,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.trainable_keys = self.adapter_keys - set( [ "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) From 8df609c4b13bca38c419c7ed3aef95ef0e241150 Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 23 Aug 2023 05:24:17 +0000 Subject: [PATCH 3/9] typo fix Signed-off-by: arendu --- .../nlp/models/language_modeling/megatron_gpt_peft_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index c0c1373033dc..02cd5cd89fb9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -329,7 +329,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): 
self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" + "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) @@ -431,7 +431,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" + "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ] ) From bb1790dc98950a76cf3e39a04dfd2ddab6a94736 Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 23 Aug 2023 05:26:33 +0000 Subject: [PATCH 4/9] update typo Signed-off-by: arendu --- .../language_modeling/megatron_gpt_peft_models.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 8274f01b238c..02cd5cd89fb9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -329,13 +329,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ -<<<<<<< HEAD "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models -======= - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ->>>>>>> 45f0722c944aa20c13036ed44a0d50a826d31d77 ] ) @@ -436,13 +431,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ -<<<<<<< HEAD "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models -======= - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight" - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models ->>>>>>> 45f0722c944aa20c13036ed44a0d50a826d31d77 ] ) From 85f90169236ba67ef71d56cd77f0769e84984550 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Aug 2023 05:27:49 +0000 Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_gpt_peft_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 02cd5cd89fb9..50c85588cc86 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -330,7 +330,7 @@ def 
__init__(self, cfg: DictConfig, trainer: Trainer): self.trainable_keys = self.adapter_keys - set( [ "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight", # for Float16Model models ] ) @@ -432,7 +432,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.trainable_keys = self.adapter_keys - set( [ "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight", # for Float16Model models ] ) From ead4cb318ecc4e07e732d731b96bfba72b7678a4 Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 23 Aug 2023 22:07:21 +0000 Subject: [PATCH 6/9] wip Signed-off-by: arendu --- .../models/language_modeling/megatron_gpt_peft_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 02cd5cd89fb9..68b4473f23f3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -389,6 +389,7 @@ def setup_optimizer_param_groups(self): self.freeze() # Freeze the entire model self._optimizer_param_groups = ({"params": []},) logging.info(f"Optimizer groups set:\n{self.summarize()}") + print('ok') class MegatronGPTAdapterPTuningModel(MegatronGPTPEFTModel): @@ -443,9 +444,17 @@ def setup_optimizer_param_groups(self): if n in self.trainable_keys: p.requires_grad = True opt_params.append(p) + #else: + #p.requires_grad = False self._optimizer_param_groups = ({"params": opt_params},) logging.info(f"Optimizer groups set:\n{self.summarize()}") + print("ok") + + def setup_optimizer_param_groups__skipp(self): + super().setup_optimizer_param_groups() + logging.info(f"Optimizer groups set:\n{self.summarize()}") + print("ok") class MegatronGPTLoRAModel(MegatronGPTLayerwisePEFTModel): From 1d013df14154ad6ec38cd522ac780dec628e1d57 Mon Sep 17 00:00:00 2001 From: arendu Date: Thu, 24 Aug 2023 21:14:35 +0000 Subject: [PATCH 7/9] wip Signed-off-by: arendu --- .../megatron_gpt_peft_models.py | 25 ++++++++++++------- .../megatron/adapters/parallel_adapters.py | 22 ++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 68b4473f23f3..0893fa0e9a11 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -81,7 +81,8 @@ def get_all_keys(self,): Returns all the keys in the model """ k = [n for n, p in self.named_parameters()] - return set(k) + b = [n for n, p in self.named_buffers() if n in self.state_dict().keys()] + return set(k + b) def get_peft_state_dict(self,): """ @@ -137,6 +138,9 @@ def setup_optimizer_param_groups(self): opt_params += [p for p in module.parameters() if p.requires_grad] self._optimizer_param_groups = ({"params": opt_params},) + for p in opt_params: + print('sum of opts', p.sum()) + logging.info(f"Optimizer groups 
len:\n{len(opt_params)}") logging.info(f"Optimizer groups set:\n{self.summarize()}") @@ -329,8 +333,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.language_model.adapter_layer.ptuning_adapter.inference_table", + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table" # for Float16Model models ] ) @@ -432,26 +436,29 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens self.trainable_keys = self.adapter_keys - set( [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table.weight", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table.weight" # for Float16Model models + "model.language_model.adapter_layer.ptuning_adapter.inference_table", + "model.module.language_model.adapter_layer.ptuning_adapter.inference_table" # for Float16Model models ] ) - def setup_optimizer_param_groups(self): + def setup_optimizer_param_groups_skip(self): self.freeze() # Freeze the entire model opt_params = [] for n, p in self.named_parameters(): if n in self.trainable_keys: p.requires_grad = True opt_params.append(p) - #else: - #p.requires_grad = False + else: + p.requires_grad = False self._optimizer_param_groups = ({"params": opt_params},) + for p in opt_params: + print('sum of opts', p.sum()) + logging.info(f"Optimizer groups len:\n{len(opt_params)}") logging.info(f"Optimizer groups set:\n{self.summarize()}") print("ok") - def setup_optimizer_param_groups__skipp(self): + def setup_optimizer_param_groups_skip(self): super().setup_optimizer_param_groups() logging.info(f"Optimizer groups set:\n{self.summarize()}") print("ok") diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 8af84ea6f0a2..511a81c39b07 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -321,8 +321,7 @@ def __init__( # (@adithyare) the persistent=False will not pollute the indices into the state_dict of this module. self.register_buffer("indices", torch.LongTensor(list(range(self.virtual_tokens))), persistent=False) self.embedding = torch.nn.Embedding(self.virtual_tokens, self.embedding_dim) - self.inference_table = nn.Embedding(self.virtual_tokens, self.output_dim) - self.inference_table.requires_grad = False + self.register_buffer("inference_table", torch.Tensor(self.virtual_tokens, self.output_dim), persistent=True) self.is_inference_ready = False self.first = ColumnParallelLinear( self.embedding_dim, @@ -357,16 +356,16 @@ def set_inference_table(self, prompt_representation: torch.Tensor): This method caches the output representation from the Encoder and saves it inside `self.inference_table`. 
""" prompt_representation = prompt_representation.detach().clone() - self.inference_table.weight.data = prompt_representation + self.inference_table.data = prompt_representation self.is_inference_ready = True return True def clear_inference_table(self,): + self.inference_table.fill_(0.0) self.is_inference_ready = False - self.inference_table.weight.data.fill_(0.0) def get_inference_table(self,): - return self.inference_table.weight.data + return self.inference_table.data def inner_forward(self,): input_embeds = self.embedding(self.indices).unsqueeze(0) @@ -385,14 +384,15 @@ def forward(self, batch_size: int, use_cached_reps: bool = False) -> torch.Tenso output_embeds = self.get_inference_table().unsqueeze(1) else: if self.training: - if self.is_inference_ready: - self.clear_inference_table() + #if self.is_inference_ready: + # self.clear_inference_table() output_embeds = self.inner_forward() else: - if not self.is_inference_ready: - output_embeds = self.inner_forward() - self.set_inference_table(output_embeds.squeeze(1)) - output_embeds = self.get_inference_table().unsqueeze(1) + output_embeds = self.inner_forward() + #if not self.is_inference_ready: + # output_embeds = self.inner_forward() + # self.set_inference_table(output_embeds.squeeze(1)) + #output_embeds = self.get_inference_table().unsqueeze(1) output_embeds = output_embeds.expand(self.virtual_tokens, batch_size, self.output_dim) return output_embeds From 39ba5f62e4c38608760e07657b6f6af2b3c56d71 Mon Sep 17 00:00:00 2001 From: arendu Date: Thu, 24 Aug 2023 23:15:09 +0000 Subject: [PATCH 8/9] using buffer to cache ptuning table Signed-off-by: arendu --- .../megatron_gpt_peft_models.py | 56 ++----------------- .../megatron/adapters/parallel_adapters.py | 12 ++-- 2 files changed, 11 insertions(+), 57 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 0893fa0e9a11..6dfd2c9394b8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -81,7 +81,8 @@ def get_all_keys(self,): Returns all the keys in the model """ k = [n for n, p in self.named_parameters()] - b = [n for n, p in self.named_buffers() if n in self.state_dict().keys()] + b = [n for n, p in self.named_buffers() if n in self.state_dict().keys()] + # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use. return set(k + b) def get_peft_state_dict(self,): @@ -136,10 +137,7 @@ def setup_optimizer_param_groups(self): module.set_enabled_adapters(enabled=True) module.unfreeze_enabled_adapters() # selectively unfreeze the adapter modules. 
opt_params += [p for p in module.parameters() if p.requires_grad] - self._optimizer_param_groups = ({"params": opt_params},) - for p in opt_params: - print('sum of opts', p.sum()) logging.info(f"Optimizer groups len:\n{len(opt_params)}") logging.info(f"Optimizer groups set:\n{self.summarize()}") @@ -214,7 +212,6 @@ def __init__( self.layer_selection = list(range(1, cfg.num_layers + 1)) super().__init__(cfg, trainer) - class MegatronGPTAdapterModelWeightTying(MegatronGPTLayerwisePEFTModel): """ TODO @@ -331,12 +328,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.name_key_to_cfg = {AdapterName.PTUNING_ADAPTER: adapter_cfg} super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens - self.trainable_keys = self.adapter_keys - set( - [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table" # for Float16Model models - ] - ) + def init_peft_modules(self,): """ @@ -380,20 +372,11 @@ def load_state_dict(self, state_dict, strict: bool = True): def setup_optimizer_param_groups(self): if self.first_stage_of_pipeline(): - # super().setup_optimizer_param_groups() - self.freeze() # Freeze the entire model - opt_params = [] - for n, p in self.named_parameters(): - if n in self.trainable_keys: - p.requires_grad = True - opt_params.append(p) - - self._optimizer_param_groups = ({"params": opt_params},) + super().setup_optimizer_param_groups() else: self.freeze() # Freeze the entire model self._optimizer_param_groups = ({"params": []},) logging.info(f"Optimizer groups set:\n{self.summarize()}") - print('ok') class MegatronGPTAdapterPTuningModel(MegatronGPTPEFTModel): @@ -434,36 +417,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): } super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens - self.trainable_keys = self.adapter_keys - set( - [ - "model.language_model.adapter_layer.ptuning_adapter.inference_table", - "model.module.language_model.adapter_layer.ptuning_adapter.inference_table" # for Float16Model models - ] - ) - - def setup_optimizer_param_groups_skip(self): - self.freeze() # Freeze the entire model - opt_params = [] - for n, p in self.named_parameters(): - if n in self.trainable_keys: - p.requires_grad = True - opt_params.append(p) - else: - p.requires_grad = False - - self._optimizer_param_groups = ({"params": opt_params},) - for p in opt_params: - print('sum of opts', p.sum()) - logging.info(f"Optimizer groups len:\n{len(opt_params)}") - logging.info(f"Optimizer groups set:\n{self.summarize()}") - print("ok") - - def setup_optimizer_param_groups_skip(self): - super().setup_optimizer_param_groups() - logging.info(f"Optimizer groups set:\n{self.summarize()}") - print("ok") - - + class MegatronGPTLoRAModel(MegatronGPTLayerwisePEFTModel): """ MegatronGPTLoRAModel is a model that combines a base model (GPTSFTModel) with a low-rank adapters. 
diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 511a81c39b07..e10733f47c34 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -384,15 +384,15 @@ def forward(self, batch_size: int, use_cached_reps: bool = False) -> torch.Tenso output_embeds = self.get_inference_table().unsqueeze(1) else: if self.training: - #if self.is_inference_ready: - # self.clear_inference_table() + if self.is_inference_ready: + self.clear_inference_table() output_embeds = self.inner_forward() else: output_embeds = self.inner_forward() - #if not self.is_inference_ready: - # output_embeds = self.inner_forward() - # self.set_inference_table(output_embeds.squeeze(1)) - #output_embeds = self.get_inference_table().unsqueeze(1) + if not self.is_inference_ready: + output_embeds = self.inner_forward() + self.set_inference_table(output_embeds.squeeze(1)) + output_embeds = self.get_inference_table().unsqueeze(1) output_embeds = output_embeds.expand(self.virtual_tokens, batch_size, self.output_dim) return output_embeds From 697a87ad0b4681f22c392fef867089d86b95c487 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Aug 2023 23:19:25 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron_gpt_peft_models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 84fd00385e67..bf177122c6a1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -81,7 +81,7 @@ def get_all_keys(self,): Returns all the keys in the model """ k = [n for n, p in self.named_parameters()] - b = [n for n, p in self.named_buffers() if n in self.state_dict().keys()] + b = [n for n, p in self.named_buffers() if n in self.state_dict().keys()] # we include buffers because ptuning representations are cached in a buffer and saved to state_dict for inference time use. return set(k + b) @@ -211,6 +211,7 @@ def __init__( self.layer_selection = list(range(1, cfg.num_layers + 1)) super().__init__(cfg, trainer) + class MegatronGPTAdapterModelWeightTying(MegatronGPTLayerwisePEFTModel): """ TODO @@ -327,7 +328,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.name_key_to_cfg = {AdapterName.PTUNING_ADAPTER: adapter_cfg} super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens - def init_peft_modules(self,): """ @@ -416,7 +416,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): } super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens - + + class MegatronGPTLoRAModel(MegatronGPTLayerwisePEFTModel): """ MegatronGPTLoRAModel is a model that combines a base model (GPTSFTModel) with a low-rank adapters.
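
A minimal standalone sketch of the buffer-based caching pattern the final revision (patch 8/9) lands on. The class name, the plain nn.Sequential encoder, and the dropped batch expansion below are simplifications for illustration, not the NeMo API; the real adapter in parallel_adapters.py builds its encoder from Megatron ColumnParallelLinear layers and expands the cached table to [virtual_tokens, batch_size, output_dim].

import torch
import torch.nn as nn


class PTuningCacheSketch(nn.Module):
    """Illustrative stand-in for the p-tuning adapter's inference-table caching."""

    def __init__(self, virtual_tokens: int, hidden_size: int):
        super().__init__()
        # Trainable pieces: virtual-token embeddings plus a small encoder MLP.
        self.embedding = nn.Embedding(virtual_tokens, hidden_size)
        self.encoder = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size),
        )
        self.register_buffer("indices", torch.arange(virtual_tokens), persistent=False)
        # persistent=True keeps the cached table in state_dict() for inference-time
        # export, while keeping it out of named_parameters(), so no optimizer-group
        # special-casing is needed (the motivation for patches 7-9).
        self.register_buffer("inference_table", torch.zeros(virtual_tokens, hidden_size), persistent=True)
        self.is_inference_ready = False

    def inner_forward(self) -> torch.Tensor:
        # Recompute the virtual-token representations from the trainable weights.
        return self.encoder(self.embedding(self.indices))  # [virtual_tokens, hidden_size]

    def set_inference_table(self, reps: torch.Tensor) -> None:
        self.inference_table.data = reps.detach().clone()
        self.is_inference_ready = True

    def clear_inference_table(self) -> None:
        self.inference_table.fill_(0.0)
        self.is_inference_ready = False

    def forward(self, use_cached_reps: bool = False) -> torch.Tensor:
        if use_cached_reps:
            return self.inference_table
        if self.training:
            # Any previously cached table is stale once the weights train again.
            if self.is_inference_ready:
                self.clear_inference_table()
            return self.inner_forward()
        # Eval mode: compute once, then serve subsequent calls from the buffer.
        if not self.is_inference_ready:
            self.set_inference_table(self.inner_forward())
        return self.inference_table


# Usage: the first eval-mode call fills the buffer; later calls can reuse it,
# and state_dict() carries it (as a buffer, not a parameter) for inference export.
adapter = PTuningCacheSketch(virtual_tokens=10, hidden_size=64).eval()
_ = adapter()
cached = adapter(use_cached_reps=True)
assert "inference_table" in adapter.state_dict()

The design point the sketch illustrates: because a registered buffer never appears in named_parameters(), the earlier attempts to subtract inference_table keys from trainable_keys and to hand-build optimizer groups become unnecessary, while get_all_keys() only needs to add named_buffers() so the cached table still reaches the saved state_dict for inference.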