
Commit c531edf

convert : fix conversion for llama 4 (ggml-org#13567)
1 parent 02cdd2d commit c531edf

2 files changed, 33 additions and 30 deletions

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
@@ -2069,6 +2069,9 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
         # split the gate_up into gate and up
         if "gate_up_proj" in name:
             name_up = name.replace("gate_up_proj", "up_proj.weight")
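The added lines are the substance of the fix: the Llama 4 HF checkpoints prefix their text-model tensors with `language_model.`, and stripping that prefix up front lets the rest of modify_tensors and the tensor mappings work with plain `model.*` / `lm_head` names. A standalone sketch of the effect (the helper name and example tensor names here are illustrative, not taken from the converter):

# Standalone sketch: how the added prefix strip changes the names the rest of
# the converter sees. Helper name and example names are illustrative only.
def strip_language_model_prefix(name: str) -> str:
    # Same logic as the lines added in modify_tensors().
    if name.startswith("language_model."):
        name = name.replace("language_model.", "")
    return name

examples = [
    "language_model.model.layers.0.self_attn.q_proj.weight",
    "language_model.lm_head.weight",
]
for raw in examples:
    print(raw, "->", strip_language_model_prefix(raw))
# language_model.model.layers.0.self_attn.q_proj.weight -> model.layers.0.self_attn.q_proj.weight
# language_model.lm_head.weight -> lm_head.weight

With the prefix gone, the llama4 entries in gguf-py/gguf/tensor_mapping.py below no longer need to carry `language_model.` either.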

gguf-py/gguf/tensor_mapping.py

Lines changed: 30 additions & 30 deletions
@@ -68,7 +68,7 @@ class TensorNameMap:
             "output_layer", # chatglm
             "head", # rwkv
             "head.out", # wavtokenizer
-            "language_model.lm_head", # llama4
+            "lm_head", # llama4
         ),

         # Output norm
@@ -91,7 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out", # rwkv6
             "model.ln_out", # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
-            "language_model.model.norm", # llama4
+            "model.norm", # llama4
         ),

         # Rope frequencies
@@ -133,7 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm", # openelm
             "rwkv.blocks.{bid}.ln1", # rwkv6
             "model.layers.{bid}.ln1", # rwkv7
-            "language_model.model.layers.{bid}.input_layernorm", # llama4
+            "model.layers.{bid}.input_layernorm", # llama4
         ),

         # Attention norm 2
@@ -173,7 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
             "transformer.h.{bid}.attn.attention.q_proj", # exaone
-            "language_model.model.layers.{bid}.self_attn.q_proj", # llama4
+            "model.layers.{bid}.self_attn.q_proj", # llama4
         ),

         # Attention key
@@ -188,7 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
             "transformer.h.{bid}.attn.attention.k_proj", # exaone
-            "language_model.model.layers.{bid}.self_attn.k_proj", # llama4
+            "model.layers.{bid}.self_attn.k_proj", # llama4
         ),

         # Attention value
@@ -202,7 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
             "transformer.h.{bid}.attn.attention.v_proj", # exaone
-            "language_model.model.layers.{bid}.self_attn.v_proj", # llama4
+            "model.layers.{bid}.self_attn.v_proj", # llama4
         ),

         # Attention output
@@ -229,7 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
-            "language_model.model.layers.{bid}.self_attn.o_proj", # llama4
+            "model.layers.{bid}.self_attn.o_proj", # llama4
         ),

         # Attention output norm
@@ -268,7 +268,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
             "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
-            "language_model.model.layers.{bid}.post_attention_layernorm", # llama4
+            "model.layers.{bid}.post_attention_layernorm", # llama4
         ),

         # Post feed-forward norm
@@ -289,7 +289,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router", # Grok
             "transformer.blocks.{bid}.ffn.router.layer", # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
-            "language_model.model.layers.{bid}.feed_forward.router", # llama4
+            "model.layers.{bid}.feed_forward.router", # llama4
             "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
         ),

@@ -329,7 +329,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3", # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
             "transformer.h.{bid}.mlp.c_fc_1", # exaone
-            "language_model.model.layers.{bid}.feed_forward.up_proj", # llama4
+            "model.layers.{bid}.feed_forward.up_proj", # llama4
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -338,14 +338,14 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
             "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
+            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.up_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
+            "model.layers.{bid}.mlp.shared_expert.up_proj",           # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj",          # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
         ),

         # AWQ-activation gate
@@ -366,22 +366,22 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1", # refact
             "model.layers.{bid}.residual_mlp.w1", # arctic
             "transformer.h.{bid}.mlp.c_fc_0", # exaone
-            "language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4
+            "model.layers.{bid}.feed_forward.gate_proj", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",                              # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",                        # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",                       # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",                          # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.experts.w1",                    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
+            "layers.{bid}.feed_forward.experts.w1",               # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",         # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",        # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",           # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1",     # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.gate_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",           # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj",          # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
         ),

         # Feed-forward down
@@ -410,7 +410,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
             "model.layers.h.{bid}.mlp.c_proj", # exaone
-            "language_model.model.layers.{bid}.feed_forward.down_proj", # llama4
+            "model.layers.{bid}.feed_forward.down_proj", # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -420,15 +420,15 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
+            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.down_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
-            "model.layers.{bid}.shared_mlp.output_linear",                              # granitemoe
+            "model.layers.{bid}.mlp.shared_expert.down_proj",           # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj",          # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",              # granitemoe
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
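The tensor_mapping.py changes are the matching half: every llama4 entry drops the `language_model.` prefix so it lines up with the names the converter now emits. A rough sketch of the matching idea (this is not the real TensorNameMap API; the template constant and helper below are made up for illustration):

# Rough sketch of how a "{bid}"-templated mapping entry resolves against a
# tensor name; not the actual TensorNameMap implementation.
LLAMA4_Q_PROJ = "model.layers.{bid}.self_attn.q_proj"  # llama4 entry after this commit

def matches(tensor_name: str, bid: int, template: str) -> bool:
    # Entries in the mapping carry no ".weight" suffix, so drop it before comparing.
    base = tensor_name.removesuffix(".weight")
    return base == template.format(bid=bid)

print(matches("model.layers.0.self_attn.q_proj.weight", 0, LLAMA4_Q_PROJ))                 # True
print(matches("language_model.model.layers.0.self_attn.q_proj.weight", 0, LLAMA4_Q_PROJ))  # False, hence the converter-side strip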
