[Preset] Add gemma2 preset (#2715)
Add the gemma2 2b, 9b, and 27b presets; remove the gemma1 presets.
CharlieFRuan authored Aug 1, 2024
1 parent 0561a9b commit 39069f7
Showing 1 changed file with 93 additions and 45 deletions.
138 changes: 93 additions & 45 deletions python/mlc_llm/model/model_preset.py
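
Each preset entry below is a plain dict of Hugging Face-style config fields. As a rough illustration of what the new shape fields imply, here is a minimal sketch that estimates the per-token KV-cache footprint of the gemma2_2b entry; it assumes the preset table in python/mlc_llm/model/model_preset.py is exported as MODEL_PRESETS, a name that is not visible in this hunk.

```python
# Minimal sketch: estimate the per-token KV-cache footprint implied by a preset's
# shape fields. Assumes the preset table in python/mlc_llm/model/model_preset.py
# is exported as MODEL_PRESETS; the field names match the gemma2_2b entry below.
from mlc_llm.model.model_preset import MODEL_PRESETS

preset = MODEL_PRESETS["gemma2_2b"]

bytes_per_elem = 2  # bfloat16, per "torch_dtype": "bfloat16"
kv_bytes_per_token = (
    2                                # one K and one V tensor per layer
    * preset["num_hidden_layers"]    # 26
    * preset["num_key_value_heads"]  # 4
    * preset["head_dim"]             # 256
    * bytes_per_elem
)
print(f"gemma2_2b KV cache per token: {kv_bytes_per_token / 1024:.0f} KiB")  # ~104 KiB
```

The same arithmetic applies to gemma2_9b and gemma2_27b with their own num_hidden_layers, num_key_value_heads, and head_dim values.
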
@@ -636,51 +636,99 @@
"use_cache": True,
"vocab_size": 103168,
},
# TODO(mlc-team): enable the model presets when stabilized.
# "gemma_2b": {
# "architectures": ["GemmaForCausalLM"],
# "attention_bias": False,
# "bos_token_id": 2,
# "eos_token_id": 1,
# "head_dim": 256,
# "hidden_act": "gelu",
# "hidden_size": 2048,
# "initializer_range": 0.02,
# "intermediate_size": 16384,
# "max_position_embeddings": 8192,
# "model_type": "gemma",
# "num_attention_heads": 8,
# "num_hidden_layers": 18,
# "num_key_value_heads": 1,
# "pad_token_id": 0,
# "rms_norm_eps": 1e-06,
# "rope_theta": 10000.0,
# "torch_dtype": "bfloat16",
# "transformers_version": "4.38.0.dev0",
# "vocab_size": 256000,
# },
# "gemma_7b": {
# "architectures": ["GemmaForCausalLM"],
# "attention_bias": False,
# "bos_token_id": 2,
# "eos_token_id": 1,
# "head_dim": 256,
# "hidden_act": "gelu",
# "hidden_size": 3072,
# "initializer_range": 0.02,
# "intermediate_size": 24576,
# "max_position_embeddings": 8192,
# "model_type": "gemma",
# "num_attention_heads": 16,
# "num_hidden_layers": 28,
# "num_key_value_heads": 16,
# "pad_token_id": 0,
# "rms_norm_eps": 1e-06,
# "rope_theta": 10000.0,
# "torch_dtype": "bfloat16",
# "transformers_version": "4.38.0.dev0",
# "vocab_size": 256000,
# },
"gemma2_2b": {
"architectures": ["Gemma2ForCausalLM"],
"attention_bias": False,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"eos_token_id": [1, 107],
"final_logit_softcapping": 30.0,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 2304,
"initializer_range": 0.02,
"intermediate_size": 9216,
"max_position_embeddings": 8192,
"model_type": "gemma2",
"num_attention_heads": 8,
"num_hidden_layers": 26,
"num_key_value_heads": 4,
"pad_token_id": 0,
"query_pre_attn_scalar": 256,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sliding_window": 4096,
"torch_dtype": "bfloat16",
"transformers_version": "4.42.4",
"use_cache": True,
"vocab_size": 256000,
},
"gemma2_9b": {
"architectures": ["Gemma2ForCausalLM"],
"attention_bias": False,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"eos_token_id": 1,
"final_logit_softcapping": 30.0,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 3584,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 8192,
"model_type": "gemma2",
"num_attention_heads": 16,
"num_hidden_layers": 42,
"num_key_value_heads": 8,
"pad_token_id": 0,
"query_pre_attn_scalar": 256,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sliding_window": 4096,
"sliding_window_size": 4096,
"torch_dtype": "bfloat16",
"transformers_version": "4.42.0.dev0",
"use_cache": True,
"vocab_size": 256000,
},
"gemma2_27b": {
"architectures": ["Gemma2ForCausalLM"],
"attention_bias": False,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"eos_token_id": 1,
"final_logit_softcapping": 30.0,
"head_dim": 128,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 4608,
"initializer_range": 0.02,
"intermediate_size": 36864,
"max_position_embeddings": 8192,
"model_type": "gemma2",
"num_attention_heads": 32,
"num_hidden_layers": 46,
"num_key_value_heads": 16,
"pad_token_id": 0,
"query_pre_attn_scalar": 144,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sliding_window": 4096,
"sliding_window_size": 4096,
"torch_dtype": "bfloat16",
"transformers_version": "4.42.0.dev0",
"use_cache": True,
"vocab_size": 256000,
"_attn_implementation": "eager",
},
"rwkv5_3b": {
"architectures": ["RwkvForCausalLM"],
"auto_map": {
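One detail worth noting in the three new presets: query_pre_attn_scalar equals head_dim (256) for gemma2_2b and gemma2_9b, while gemma2_27b uses hidden_size / num_attention_heads (4608 / 32 = 144) rather than its head_dim of 128. The self-contained sketch below checks this from the values in the diff; the 1 / sqrt(query_pre_attn_scalar) attention scale follows the Gemma-2 reference implementation and is not an API defined in this file.

```python
# Sanity-check the attention-scaling fields of the new gemma2 presets.
# Shape values are copied from the diff above; the scale formula
# (1 / sqrt(query_pre_attn_scalar)) follows the Gemma-2 reference attention.
import math

gemma2_shapes = {
    "gemma2_2b":  {"hidden_size": 2304, "num_attention_heads": 8,  "head_dim": 256, "query_pre_attn_scalar": 256},
    "gemma2_9b":  {"hidden_size": 3584, "num_attention_heads": 16, "head_dim": 256, "query_pre_attn_scalar": 256},
    "gemma2_27b": {"hidden_size": 4608, "num_attention_heads": 32, "head_dim": 128, "query_pre_attn_scalar": 144},
}

for name, cfg in gemma2_shapes.items():
    scale = 1.0 / math.sqrt(cfg["query_pre_attn_scalar"])
    same_as_head_dim = cfg["query_pre_attn_scalar"] == cfg["head_dim"]
    same_as_hidden_over_heads = (
        cfg["query_pre_attn_scalar"] == cfg["hidden_size"] // cfg["num_attention_heads"]
    )
    print(
        f"{name}: scale = 1/sqrt({cfg['query_pre_attn_scalar']}) = {scale:.4f}, "
        f"== head_dim: {same_as_head_dim}, == hidden_size/num_heads: {same_as_hidden_over_heads}"
    )
```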
