This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Gemma2 Support (#78)
mikecovlee authored Jul 18, 2024
1 parent 39cb1f4 commit d64e367
Showing 13 changed files with 674 additions and 25 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -107,6 +107,8 @@ For users with NVIDIA Ampere or newer GPU architectures, the `--tf32` option can

+ Quantization with Qwen2 has no effect (same as with transformers).
+ Applying quantization with DoRA will result in higher memory and computation costs (same as with PEFT).
+ Sliding window attention with the generation cache may produce abnormal output.
+ ChatGLM models with the generation cache may produce abnormal output.

## Installation

30 changes: 19 additions & 11 deletions launch.py
@@ -18,6 +18,8 @@ def compose_command(
      log_file: str = "mlora.log",
      overwrite: bool = False,
      attn_impl: str = None,
+     sliding_window: bool = False,
+     use_cache: bool = True,
      quantize: str = None,
      dtype: str = "bf16",
      tf32: bool = False,
@@ -37,6 +39,10 @@
command += " --overwrite"
if attn_impl is not None:
command += f" --attn_impl {attn_impl}"
if sliding_window:
command += " --sliding_window"
if not use_cache:
command += " --disable_cache"
if quantize is not None:
command += f" --load_{quantize}"
if dtype in ("fp16", "bf16"):
@@ -201,17 +207,19 @@ def show_help():
      --group_by_length
  Arguments of run, train, inference and evaluate:
-     --base_model model name or path
-     --config [mlora.json]
-     --load_adapter [false]
-     --random_seed [42]
-     --cuda_device [0]
-     --log_file [mlora.log]
-     --overwrite [false]
-     --attn_impl [eager]
-     --quantize [none], 4bit, 8bit
-     --dtype [bf16], fp16, fp32
-     --tf32 [false]
+     --base_model model name or path
+     --config [mlora.json]
+     --load_adapter [false]
+     --random_seed [42]
+     --cuda_device [0]
+     --log_file [mlora.log]
+     --overwrite [false]
+     --attn_impl [eager]
+     --sliding_window [false]
+     --use_cache [true]
+     --quantize [none], 4bit, 8bit
+     --dtype [bf16], fp16, fp32
+     --tf32 [false]
"""
)

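The two new launch options follow the same pattern as the existing ones: a keyword argument of `compose_command` is turned into a CLI flag for `mlora.py` only when it differs from its default. Below is a minimal standalone sketch of that mapping; the `compose_flags` helper is hypothetical and mirrors only the lines added above, while the real `compose_command` also handles the other options shown in the help text, such as `--overwrite`, `--attn_impl`, and quantization.

```python
def compose_flags(sliding_window: bool = False, use_cache: bool = True) -> str:
    """Illustrative sketch of how launch.py could map options onto mlora.py flags."""
    command = ""
    if sliding_window:
        # opt-in flag: only emitted when the caller enables it
        command += " --sliding_window"
    if not use_cache:
        # the cache defaults to on, so only the *disable* flag is ever forwarded
        command += " --disable_cache"
    return command


# Example: enable sliding-window attention and turn the generation cache off
print(compose_flags(sliding_window=True, use_cache=False))
# -> " --sliding_window --disable_cache"
```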
2 changes: 1 addition & 1 deletion mlora.py
@@ -251,7 +251,7 @@ def inference(
          tokenizer,
          configs,
          max_gen_len=128,
-         use_cache=args.disable_cache,
+         use_cache=not args.disable_cache,
          concurrent_jobs=concurrent_jobs,
          cache_implementation=args.cache_implementation,
          stream_callback=callback,
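The one-line change above fixes an inverted boolean: `--disable_cache` is a negative flag, so `use_cache` must be its negation, otherwise the cache stays enabled exactly when the user asks to disable it. A small sketch of the intended relationship, assuming `--disable_cache` is declared as a plain store-true argument (the actual parser setup in `mlora.py` is not part of this diff):

```python
import argparse

parser = argparse.ArgumentParser()
# assumed declaration for illustration; the real parser defines many more options
parser.add_argument("--disable_cache", action="store_true")

args = parser.parse_args(["--disable_cache"])

buggy_use_cache = args.disable_cache      # True  -> cache stays on (wrong)
fixed_use_cache = not args.disable_cache  # False -> cache actually disabled
print(buggy_use_cache, fixed_use_cache)
```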
6 changes: 3 additions & 3 deletions mlora/common/cache.py
@@ -160,7 +160,7 @@ def __init__(
              config.max_seq_len_ if max_cache_len is None else max_cache_len
          )
          # Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-         self.head_dim = config.dim_ // config.n_heads_
+         self.head_dim = config.head_dim_

          self.dtype = dtype if dtype is not None else torch.float32
          self.num_key_value_heads = config.n_kv_heads_
@@ -327,7 +327,7 @@ def __init__(
          self.max_cache_len = max_cache_len
          self.max_batch_size = max_batch_size
          # Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-         self.head_dim = config.dim_ // config.n_heads_
+         self.head_dim = config.head_dim_

          self.dtype = dtype if dtype is not None else torch.float32
          self.num_key_value_heads = config.n_kv_heads_
@@ -461,7 +461,7 @@ def get_max_length(self) -> Optional[int]:
          return self.max_cache_len

      def get_seq_length(self, layer_idx: Optional[int] = 0):
-         return None
+         return 0

      def reset(self):
          """Resets the cache values while preserving the objects"""
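Both static-cache constructors previously derived the per-head dimension as `dim_ // n_heads_`, which yields the wrong key/value buffer shape for models whose config sets `head_dim` explicitly, as Gemma and Gemma2 do. A short sketch of the difference, using Gemma2-9B-like values as an illustrative assumption (exact numbers may vary by checkpoint):

```python
import torch

# Illustrative Gemma2-9B-style configuration values (assumed for this sketch)
hidden_size = 3584
num_attention_heads = 16
num_key_value_heads = 8
head_dim = 256  # set explicitly in the model config

max_batch_size, max_cache_len = 1, 128

derived = hidden_size // num_attention_heads  # 224 -- the old, incorrect derivation
explicit = head_dim                           # 256 -- what the patched cache reads

old_shape = (max_batch_size, num_key_value_heads, max_cache_len, derived)
new_shape = (max_batch_size, num_key_value_heads, max_cache_len, explicit)
print(old_shape, new_shape)  # the buffers differ in the last dimension

# A cache allocated with the wrong last dimension cannot hold the projected keys/values:
keys = torch.zeros(new_shape)
assert keys.shape[-1] == head_dim
```

The third hunk makes `get_seq_length` return `0` rather than `None`, so callers that add the cached length to the new token count no longer have to special-case a missing value.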
2 changes: 1 addition & 1 deletion mlora/common/modelargs.py
@@ -23,7 +23,7 @@ class LLMModelConfig:
      name_or_path_: str = ""
      device_: str = ""
      dim_: int = 4096
-     multiple_of_: int = 256
+     head_dim_: int = 256
      intermediate_: int = 11008
      n_heads_: int = 32
      n_kv_heads_: int = 32
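The changed line replaces `multiple_of_` with a `head_dim_` field, so the attention head dimension is stored directly on `LLMModelConfig` and downstream code such as the cache no longer has to derive it. A hedged sketch of how the visible fields fit together, mirroring only the lines shown above (the real dataclass has more fields):

```python
from dataclasses import dataclass


@dataclass
class LLMModelConfigSketch:
    # Mirrors only the LLMModelConfig fields visible in this hunk (illustrative)
    name_or_path_: str = ""
    device_: str = ""
    dim_: int = 4096
    head_dim_: int = 256  # replaces the former multiple_of_ field
    intermediate_: int = 11008
    n_heads_: int = 32
    n_kv_heads_: int = 32


# Gemma2-9B-like values (assumed): head_dim_ is not dim_ // n_heads_,
# which is precisely why the cache now reads head_dim_ instead of deriving it.
cfg = LLMModelConfigSketch(
    name_or_path_="google/gemma-2-9b", dim_=3584, n_heads_=16, n_kv_heads_=8, head_dim_=256
)
print(cfg.dim_ // cfg.n_heads_, cfg.head_dim_)  # 224 256
```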
2 changes: 1 addition & 1 deletion mlora/model.py
@@ -38,7 +38,7 @@ class CasualOutputLayer(LLMOutput):
      def __init__(self, vocab_size: int, weight: torch.nn.Linear):
          super().__init__()
          self.vocab_size_: int = vocab_size
-         self.lm_head_: torch.nn.Linear = weight
+         self.lm_head_: torch.nn.Module = weight

      def forward(self, data: torch.Tensor) -> torch.Tensor:
          return self.lm_head_(data).float()
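Widening the stored annotation from `torch.nn.Linear` to `torch.nn.Module` lets the output head hold any module that maps hidden states to vocabulary logits (for example a wrapped or quantized projection), since `forward` only calls it and casts the result to float. A minimal re-creation for illustration; the real class derives from `LLMOutput`, which is not shown here, and the constructor hint is relaxed in this sketch as an assumption:

```python
import torch


class CasualOutputLayerSketch(torch.nn.Module):
    """Illustrative stand-in for the output head shown above."""

    def __init__(self, vocab_size: int, weight: torch.nn.Module):
        super().__init__()
        self.vocab_size_: int = vocab_size
        self.lm_head_: torch.nn.Module = weight  # any module producing vocab logits

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.lm_head_(data).float()


hidden, vocab = 16, 32
plain = CasualOutputLayerSketch(vocab, torch.nn.Linear(hidden, vocab, bias=False))
wrapped = CasualOutputLayerSketch(
    vocab, torch.nn.Sequential(torch.nn.Linear(hidden, vocab, bias=False))
)
for head in (plain, wrapped):
    logits = head(torch.randn(2, hidden))
    print(logits.shape, logits.dtype)  # torch.Size([2, 32]) torch.float32
```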
2 changes: 2 additions & 0 deletions mlora/models/__init__.py
@@ -1,5 +1,6 @@
  from .modeling_chatglm import GLMForCausalLM
  from .modeling_gemma import GemmaForCausalLM
+ from .modeling_gemma2 import Gemma2ForCausalLM
  from .modeling_llama import LlamaForCausalLM
  from .modeling_mistral import MistralForCausalLM
  from .modeling_mistral import MistralForCausalLM as Qwen2ForCausalLM
@@ -8,6 +9,7 @@
  model_dict = {
      "llama": LlamaForCausalLM,
      "gemma": GemmaForCausalLM,
+     "gemma2": Gemma2ForCausalLM,
      "mistral": MistralForCausalLM,
      "qwen2": Qwen2ForCausalLM,
      "phi": PhiForCausalLM,
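`model_dict` keys are Hugging Face `model_type` strings, so registering `Gemma2ForCausalLM` under `"gemma2"` is what lets mLoRA resolve Gemma2 checkpoints, whose configs report `model_type == "gemma2"`. A hypothetical dispatch sketch (the real loader lives elsewhere in mLoRA and its exact arguments are not shown in this diff):

```python
from transformers import AutoConfig

from mlora.models import model_dict


def resolve_model_class(name_or_path: str):
    """Pick the mLoRA causal-LM class for a checkpoint from its config.model_type."""
    model_type = AutoConfig.from_pretrained(name_or_path).model_type
    if model_type not in model_dict:
        raise ValueError(f"unsupported model type: {model_type}")
    return model_dict[model_type]


# e.g. resolve_model_class("google/gemma-2-9b") would now return Gemma2ForCausalLM
```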
1 change: 1 addition & 0 deletions mlora/models/modeling_gemma.py
@@ -74,6 +74,7 @@ def from_pretrained(
          name_or_path_=llm_config.name_or_path,
          vocab_size_=llm_config.vocab_size,
          dim_=llm_config.hidden_size,
+         head_dim_=llm_config.head_dim,
          intermediate_=llm_config.intermediate_size,
          n_layers_=llm_config.num_hidden_layers,
          n_heads_=llm_config.num_attention_heads,
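The added keyword forwards the `head_dim` attribute of the Hugging Face Gemma config into `LLMModelConfig`, matching the cache change above. A hedged sketch of reading that value defensively, for configs that might not expose `head_dim` (Gemma's config does define it; the fallback is an assumption for other architectures):

```python
from transformers import AutoConfig

# Illustrative checkpoint; Gemma configs are gated and may require authentication
llm_config = AutoConfig.from_pretrained("google/gemma-7b")

head_dim = getattr(
    llm_config,
    "head_dim",
    llm_config.hidden_size // llm_config.num_attention_heads,
)
# For Gemma-7B this prints 256, while hidden_size // num_attention_heads would give 192
print(head_dim)
```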
