This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

fix issues of legacy gemma models
mikecovlee committed Jul 17, 2024
1 parent e5b5d63 commit edbb95c
Showing 2 changed files with 12 additions and 2 deletions.
5 changes: 4 additions & 1 deletion mlora/common/feed_forward.py
@@ -30,7 +30,10 @@ def forward(
 
     # MixLoRA
     def init_moe_weight(
-        self, args: LLMModelConfig, config: MixConfig, gate: Optional[torch.Tensor] = None
+        self,
+        args: LLMModelConfig,
+        config: MixConfig,
+        gate: Optional[torch.Tensor] = None,
     ):
         self.moes_[config.adapter_name] = moe_layer_factory(args, config)
         if gate is None:
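
The hunk above only rewraps the init_moe_weight signature, but the surrounding context shows the MixLoRA registration pattern: each adapter gets its own MoE block from moe_layer_factory(args, config), stored in self.moes_ under the adapter name and optionally seeded with a pre-trained router gate. The sketch below illustrates that pattern in isolation; the SimpleMoE stand-in, its gate shape, and the register_moe helper are assumptions for illustration, not mLoRA's actual moe_layer_factory API.

from typing import Optional

import torch


class SimpleMoE(torch.nn.Module):
    # Stand-in for the module returned by moe_layer_factory (assumed shape).
    def __init__(self, hidden_size: int, num_experts: int) -> None:
        super().__init__()
        # Router gate maps hidden states to expert logits.
        self.gate_ = torch.nn.Linear(hidden_size, num_experts, bias=False)


def register_moe(
    moes: dict,
    adapter_name: str,
    hidden_size: int,
    num_experts: int,
    gate: Optional[torch.Tensor] = None,
) -> dict:
    # Register one MoE block per adapter, keyed by adapter name.
    moes[adapter_name] = SimpleMoE(hidden_size, num_experts)
    if gate is not None:
        # Seed the router with a pre-trained gate weight when one is provided.
        with torch.no_grad():
            moes[adapter_name].gate_.weight.copy_(gate)
    return moes


moes = register_moe({}, "mixlora_0", hidden_size=2048, num_experts=8)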
9 changes: 8 additions & 1 deletion mlora/models/modeling_gemma.py
@@ -50,6 +50,13 @@ def forward(self, tokens: torch.Tensor) -> torch.Tensor:
         return data * normalizer
 
 
+def _patch_hidden_act(config: modeling_gemma.GemmaConfig) -> str:
+    if hasattr(config, "hidden_activation") and config.hidden_activation is not None:
+        return config.hidden_activation
+    else:
+        return config.hidden_act
+
+
 class GemmaForCausalLM(LlamaForCausalLM):
     def __init__(self, config: LlamaConfig) -> None:
         super().__init__(config)
@@ -71,7 +78,7 @@ def from_pretrained(
             n_layers_=llm_config.num_hidden_layers,
             n_heads_=llm_config.num_attention_heads,
             n_kv_heads_=llm_config.num_key_value_heads,
-            hidden_act_=llm_config.hidden_activation,
+            hidden_act_=_patch_hidden_act(llm_config),
             rms_norm_eps_=llm_config.rms_norm_eps,
             max_seq_len_=llm_config.max_position_embeddings,
             rope_theta_=llm_config.rope_theta,
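
The substance of the fix is _patch_hidden_act: newer transformers Gemma configs populate hidden_activation, while legacy Gemma configs only carry hidden_act, so the helper prefers the former and falls back to the latter instead of reading llm_config.hidden_activation unconditionally. A minimal sketch of that fallback, using lightweight stand-in objects instead of real GemmaConfig instances (the activation strings are illustrative):

from types import SimpleNamespace


def _patch_hidden_act(config) -> str:
    # Same logic as the helper above, without the GemmaConfig type hint.
    if hasattr(config, "hidden_activation") and config.hidden_activation is not None:
        return config.hidden_activation
    else:
        return config.hidden_act


# Legacy-style config: only hidden_act is present.
legacy = SimpleNamespace(hidden_act="gelu")
# Newer-style config: hidden_activation is populated as well.
current = SimpleNamespace(hidden_act="gelu", hidden_activation="gelu_pytorch_tanh")

assert _patch_hidden_act(legacy) == "gelu"
assert _patch_hidden_act(current) == "gelu_pytorch_tanh"

Either way, from_pretrained now resolves an activation name for both legacy and current Gemma checkpoints.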
