This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Gemma2 Support (#78)
mikecovlee authored Jul 18, 2024
1 parent 39cb1f4 commit d64e367
Showing 13 changed files with 674 additions and 25 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -107,6 +107,8 @@ For users with NVIDIA Ampere or newer GPU architectures, the `--tf32` option can

+ Quantization with Qwen2 has no effect (same as with transformers).
+ Applying quantization with DoRA will result in higher memory and computation costs (same as with PEFT).
+ Sliding window attention with the generation cache may produce abnormal output.
+ ChatGLM models with the generation cache may produce abnormal output.

## Installation

30 changes: 19 additions & 11 deletions launch.py
@@ -18,6 +18,8 @@ def compose_command(
      log_file: str = "mlora.log",
      overwrite: bool = False,
      attn_impl: str = None,
+     sliding_window: bool = False,
+     use_cache: bool = True,
      quantize: str = None,
      dtype: str = "bf16",
      tf32: bool = False,
@@ -37,6 +39,10 @@
command += " --overwrite"
if attn_impl is not None:
command += f" --attn_impl {attn_impl}"
if sliding_window:
command += " --sliding_window"
if not use_cache:
command += " --disable_cache"
if quantize is not None:
command += f" --load_{quantize}"
if dtype in ("fp16", "bf16"):
@@ -201,17 +207,19 @@ def show_help():
      --group_by_length
  Arguments of run, train, inference and evaluate:
-     --base_model model name or path
-     --config [mlora.json]
-     --load_adapter [false]
-     --random_seed [42]
-     --cuda_device [0]
-     --log_file [mlora.log]
-     --overwrite [false]
-     --attn_impl [eager]
-     --quantize [none], 4bit, 8bit
-     --dtype [bf16], fp16, fp32
-     --tf32 [false]
+     --base_model model name or path
+     --config [mlora.json]
+     --load_adapter [false]
+     --random_seed [42]
+     --cuda_device [0]
+     --log_file [mlora.log]
+     --overwrite [false]
+     --attn_impl [eager]
+     --sliding_window [false]
+     --use_cache [true]
+     --quantize [none], 4bit, 8bit
+     --dtype [bf16], fp16, fp32
+     --tf32 [false]
"""
)

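The two new launch options follow the same pattern as the existing ones: a keyword argument of `compose_command` is turned into a CLI flag for `mlora.py` only when it differs from its default. Below is a minimal standalone sketch of that mapping; the `compose_flags` helper is hypothetical and mirrors only the lines added above, while the real `compose_command` also handles the other options shown in the help text, such as `--overwrite`, `--attn_impl`, and quantization.

```python
def compose_flags(sliding_window: bool = False, use_cache: bool = True) -> str:
    """Illustrative sketch of how launch.py could map options onto mlora.py flags."""
    command = ""
    if sliding_window:
        # opt-in flag: only emitted when the caller enables it
        command += " --sliding_window"
    if not use_cache:
        # the cache defaults to on, so only the *disable* flag is ever forwarded
        command += " --disable_cache"
    return command


# Example: enable sliding-window attention and turn the generation cache off
print(compose_flags(sliding_window=True, use_cache=False))
# -> " --sliding_window --disable_cache"
```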
2 changes: 1 addition & 1 deletion mlora.py
@@ -251,7 +251,7 @@ def inference(
          tokenizer,
          configs,
          max_gen_len=128,
-         use_cache=args.disable_cache,
+         use_cache=not args.disable_cache,
          concurrent_jobs=concurrent_jobs,
          cache_implementation=args.cache_implementation,
          stream_callback=callback,
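The one-line change above fixes an inverted boolean: `--disable_cache` is a negative flag, so `use_cache` must be its negation, otherwise the cache stays enabled exactly when the user asks to disable it. A small sketch of the intended relationship, assuming `--disable_cache` is declared as a plain store-true argument (the actual parser setup in `mlora.py` is not part of this diff):

```python
import argparse

parser = argparse.ArgumentParser()
# assumed declaration for illustration; the real parser defines many more options
parser.add_argument("--disable_cache", action="store_true")

args = parser.parse_args(["--disable_cache"])

buggy_use_cache = args.disable_cache      # True  -> cache stays on (wrong)
fixed_use_cache = not args.disable_cache  # False -> cache actually disabled
print(buggy_use_cache, fixed_use_cache)
```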
6 changes: 3 additions & 3 deletions mlora/common/cache.py
@@ -160,7 +160,7 @@ def __init__(
              config.max_seq_len_ if max_cache_len is None else max_cache_len
          )
          # Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-         self.head_dim = config.dim_ // config.n_heads_
+         self.head_dim = config.head_dim_

          self.dtype = dtype if dtype is not None else torch.float32
          self.num_key_value_heads = config.n_kv_heads_
@@ -327,7 +327,7 @@ def __init__(
          self.max_cache_len = max_cache_len
          self.max_batch_size = max_batch_size
          # Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-         self.head_dim = config.dim_ // config.n_heads_
+         self.head_dim = config.head_dim_

          self.dtype = dtype if dtype is not None else torch.float32
          self.num_key_value_heads = config.n_kv_heads_
@@ -461,7 +461,7 @@ def get_max_length(self) -> Optional[int]:
          return self.max_cache_len

      def get_seq_length(self, layer_idx: Optional[int] = 0):
-         return None
+         return 0

      def reset(self):
          """Resets the cache values while preserving the objects"""
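Both static-cache constructors previously derived the per-head dimension as `dim_ // n_heads_`, which yields the wrong key/value buffer shape for models whose config sets `head_dim` explicitly, as Gemma and Gemma2 do. A short sketch of the difference, using Gemma2-9B-like values as an illustrative assumption (exact numbers may vary by checkpoint):

```python
import torch

# Illustrative Gemma2-9B-style configuration values (assumed for this sketch)
hidden_size = 3584
num_attention_heads = 16
num_key_value_heads = 8
head_dim = 256  # set explicitly in the model config

max_batch_size, max_cache_len = 1, 128

derived = hidden_size // num_attention_heads  # 224 -- the old, incorrect derivation
explicit = head_dim                           # 256 -- what the patched cache reads

old_shape = (max_batch_size, num_key_value_heads, max_cache_len, derived)
new_shape = (max_batch_size, num_key_value_heads, max_cache_len, explicit)
print(old_shape, new_shape)  # the buffers differ in the last dimension

# A cache allocated with the wrong last dimension cannot hold the projected keys/values:
keys = torch.zeros(new_shape)
assert keys.shape[-1] == head_dim
```

The third hunk makes `get_seq_length` return `0` rather than `None`, so callers that add the cached length to the new token count no longer have to special-case a missing value.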
2 changes: 1 addition & 1 deletion mlora/common/modelargs.py
@@ -23,7 +23,7 @@ class LLMModelConfig:
      name_or_path_: str = ""
      device_: str = ""
      dim_: int = 4096
-     multiple_of_: int = 256
+     head_dim_: int = 256
      intermediate_: int = 11008
      n_heads_: int = 32
      n_kv_heads_: int = 32
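The changed line replaces `multiple_of_` with a `head_dim_` field, so the attention head dimension is stored directly on `LLMModelConfig` and downstream code such as the cache no longer has to derive it. A hedged sketch of how the visible fields fit together, mirroring only the lines shown above (the real dataclass has more fields):

```python
from dataclasses import dataclass


@dataclass
class LLMModelConfigSketch:
    # Mirrors only the LLMModelConfig fields visible in this hunk (illustrative)
    name_or_path_: str = ""
    device_: str = ""
    dim_: int = 4096
    head_dim_: int = 256  # replaces the former multiple_of_ field
    intermediate_: int = 11008
    n_heads_: int = 32
    n_kv_heads_: int = 32


# Gemma2-9B-like values (assumed): head_dim_ is not dim_ // n_heads_,
# which is precisely why the cache now reads head_dim_ instead of deriving it.
cfg = LLMModelConfigSketch(
    name_or_path_="google/gemma-2-9b", dim_=3584, n_heads_=16, n_kv_heads_=8, head_dim_=256
)
print(cfg.dim_ // cfg.n_heads_, cfg.head_dim_)  # 224 256
```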
2 changes: 1 addition & 1 deletion mlora/model.py
@@ -38,7 +38,7 @@ class CasualOutputLayer(LLMOutput):
      def __init__(self, vocab_size: int, weight: torch.nn.Linear):
          super().__init__()
          self.vocab_size_: int = vocab_size
-         self.lm_head_: torch.nn.Linear = weight
+         self.lm_head_: torch.nn.Module = weight

      def forward(self, data: torch.Tensor) -> torch.Tensor:
          return self.lm_head_(data).float()
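Widening the stored annotation from `torch.nn.Linear` to `torch.nn.Module` lets the output head hold any module that maps hidden states to vocabulary logits (for example a wrapped or quantized projection), since `forward` only calls it and casts the result to float. A minimal re-creation for illustration; the real class derives from `LLMOutput`, which is not shown here, and the constructor hint is relaxed in this sketch as an assumption:

```python
import torch


class CasualOutputLayerSketch(torch.nn.Module):
    """Illustrative stand-in for the output head shown above."""

    def __init__(self, vocab_size: int, weight: torch.nn.Module):
        super().__init__()
        self.vocab_size_: int = vocab_size
        self.lm_head_: torch.nn.Module = weight  # any module producing vocab logits

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.lm_head_(data).float()


hidden, vocab = 16, 32
plain = CasualOutputLayerSketch(vocab, torch.nn.Linear(hidden, vocab, bias=False))
wrapped = CasualOutputLayerSketch(
    vocab, torch.nn.Sequential(torch.nn.Linear(hidden, vocab, bias=False))
)
for head in (plain, wrapped):
    logits = head(torch.randn(2, hidden))
    print(logits.shape, logits.dtype)  # torch.Size([2, 32]) torch.float32
```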
2 changes: 2 additions & 0 deletions mlora/models/__init__.py
@@ -1,5 +1,6 @@
  from .modeling_chatglm import GLMForCausalLM
  from .modeling_gemma import GemmaForCausalLM
+ from .modeling_gemma2 import Gemma2ForCausalLM
  from .modeling_llama import LlamaForCausalLM
  from .modeling_mistral import MistralForCausalLM
  from .modeling_mistral import MistralForCausalLM as Qwen2ForCausalLM
@@ -8,6 +9,7 @@
  model_dict = {
      "llama": LlamaForCausalLM,
      "gemma": GemmaForCausalLM,
+     "gemma2": Gemma2ForCausalLM,
      "mistral": MistralForCausalLM,
      "qwen2": Qwen2ForCausalLM,
      "phi": PhiForCausalLM,
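`model_dict` keys are Hugging Face `model_type` strings, so registering `Gemma2ForCausalLM` under `"gemma2"` is what lets mLoRA resolve Gemma2 checkpoints, whose configs report `model_type == "gemma2"`. A hypothetical dispatch sketch (the real loader lives elsewhere in mLoRA and its exact arguments are not shown in this diff):

```python
from transformers import AutoConfig

from mlora.models import model_dict


def resolve_model_class(name_or_path: str):
    """Pick the mLoRA causal-LM class for a checkpoint from its config.model_type."""
    model_type = AutoConfig.from_pretrained(name_or_path).model_type
    if model_type not in model_dict:
        raise ValueError(f"unsupported model type: {model_type}")
    return model_dict[model_type]


# e.g. resolve_model_class("google/gemma-2-9b") would now return Gemma2ForCausalLM
```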
1 change: 1 addition & 0 deletions mlora/models/modeling_gemma.py
@@ -74,6 +74,7 @@ def from_pretrained(
          name_or_path_=llm_config.name_or_path,
          vocab_size_=llm_config.vocab_size,
          dim_=llm_config.hidden_size,
+         head_dim_=llm_config.head_dim,
          intermediate_=llm_config.intermediate_size,
          n_layers_=llm_config.num_hidden_layers,
          n_heads_=llm_config.num_attention_heads,
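The added keyword forwards the `head_dim` attribute of the Hugging Face Gemma config into `LLMModelConfig`, matching the cache change above. A hedged sketch of reading that value defensively, for configs that might not expose `head_dim` (Gemma's config does define it; the fallback is an assumption for other architectures):

```python
from transformers import AutoConfig

# Illustrative checkpoint; Gemma configs are gated and may require authentication
llm_config = AutoConfig.from_pretrained("google/gemma-7b")

head_dim = getattr(
    llm_config,
    "head_dim",
    llm_config.hidden_size // llm_config.num_attention_heads,
)
# For Gemma-7B this prints 256, while hidden_size // num_attention_heads would give 192
print(head_dim)
```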
