@@ -68,7 +68,7 @@ class TensorNameMap:
             "output_layer",             # chatglm
             "head",                     # rwkv
             "head.out",                 # wavtokenizer
-            "language_model.lm_head",   # llama4
+            "lm_head",                  # llama4
         ),

         # Output norm
@@ -91,7 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out",                  # rwkv6
             "model.ln_out",                 # rwkv7
             "backbone.final_layer_norm",    # wavtokenizer
-            "language_model.model.norm",    # llama4
+            "model.norm",                   # llama4
         ),

         # Rope frequencies
@@ -133,7 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm",                 # openelm
             "rwkv.blocks.{bid}.ln1",                              # rwkv6
             "model.layers.{bid}.ln1",                             # rwkv7
-            "language_model.model.layers.{bid}.input_layernorm",  # llama4
+            "model.layers.{bid}.input_layernorm",                 # llama4
         ),

         # Attention norm 2
@@ -173,7 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq",                           # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj",                 # exaone
-            "language_model.model.layers.{bid}.self_attn.q_proj",        # llama4
+            "model.layers.{bid}.self_attn.q_proj",                       # llama4
         ),

         # Attention key
@@ -188,7 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk",                         # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj",               # exaone
-            "language_model.model.layers.{bid}.self_attn.k_proj",      # llama4
+            "model.layers.{bid}.self_attn.k_proj",                     # llama4
         ),

         # Attention value
@@ -202,7 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv",                           # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj",                 # exaone
-            "language_model.model.layers.{bid}.self_attn.v_proj",        # llama4
+            "model.layers.{bid}.self_attn.v_proj",                       # llama4
         ),

         # Attention output
@@ -229,7 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense",           # chatglm
             "transformer.layers.{bid}.attn.out_proj",              # openelm
             "transformer.h.{bid}.attn.attention.out_proj",         # exaone
-            "language_model.model.layers.{bid}.self_attn.o_proj",  # llama4
+            "model.layers.{bid}.self_attn.o_proj",                 # llama4
         ),

         # Attention output norm
@@ -268,7 +268,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",                  # Grok
             "encoder.layers.{bid}.post_attention_layernorm",               # chatglm
             "transformer.layers.{bid}.ffn_norm",                           # openelm
-            "language_model.model.layers.{bid}.post_attention_layernorm",  # llama4
+            "model.layers.{bid}.post_attention_layernorm",                 # llama4
         ),

         # Post feed-forward norm
@@ -289,7 +289,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router",                 # Grok
             "transformer.blocks.{bid}.ffn.router.layer",              # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer",       # granitemoe
-            "language_model.model.layers.{bid}.feed_forward.router",  # llama4
+            "model.layers.{bid}.feed_forward.router",                 # llama4
             "encoder.layers.{bid}.mlp.router.layer",                  # nomic-bert-moe
         ),

@@ -329,7 +329,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3",                      # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",                  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",                          # exaone
-            "language_model.model.layers.{bid}.feed_forward.up_proj",  # llama4
+            "model.layers.{bid}.feed_forward.up_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -338,14 +338,14 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",                     # dbrx
             "model.layers.{bid}.mlp.experts.up_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3",                  # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.up_proj",  # llama4
+            "model.layers.{bid}.feed_forward.experts.up_proj",                 # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w1",                         # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.up_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj",   # llama4
+            "model.layers.{bid}.mlp.shared_expert.up_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.up_proj",   # llama4
         ),

         # AWQ-activation gate
@@ -366,22 +366,22 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1",                          # refact
             "model.layers.{bid}.residual_mlp.w1",                        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",                            # exaone
-            "language_model.model.layers.{bid}.feed_forward.gate_proj",  # llama4
+            "model.layers.{bid}.feed_forward.gate_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",                              # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",                        # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",                       # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",                          # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.experts.w1",                    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
+            "layers.{bid}.feed_forward.experts.w1",               # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",         # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",        # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",           # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1",     # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.gate_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj",   # llama4
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.gate_proj",   # llama4
         ),

         # Feed-forward down
@@ -410,7 +410,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer",                        # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h",                    # chatglm
             "model.layers.h.{bid}.mlp.c_proj",                           # exaone
-            "language_model.model.layers.{bid}.feed_forward.down_proj",  # llama4
+            "model.layers.{bid}.feed_forward.down_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -420,15 +420,15 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear",                 # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2",                    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.down_proj",  # llama4
+            "model.layers.{bid}.feed_forward.experts.down_proj",                 # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w2",                           # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.down_proj",                           # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj",                          # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj",   # llama4
-            "model.layers.{bid}.shared_mlp.output_linear",                              # granitemoe
+            "model.layers.{bid}.mlp.shared_expert.down_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.down_proj",   # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",               # granitemoe
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
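Each tuple above lists candidate source-model tensor names for one target tensor, with {bid} standing in for the block index; the change simply drops the "language_model." prefix from the llama4 candidates. Below is a minimal sketch of how such patterns can expand into a concrete source-name-to-target-name lookup. The build_name_map helper and the "blk.{bid}.attn_q" key are illustrative assumptions, not the real TensorNameMap API.

# Illustrative sketch only (not the real TensorNameMap implementation).
# Hypothetical subset of the mapping table, using post-change llama4 names.
MAPPINGS: dict[str, tuple[str, ...]] = {
    "blk.{bid}.attn_q": (
        "model.layers.{bid}.attention.wq",      # internlm2
        "model.layers.{bid}.self_attn.q_proj",  # llama4 (prefix removed)
    ),
}

def build_name_map(n_blocks: int) -> dict[str, str]:
    """Expand {bid} for every block; return source name -> target key."""
    out: dict[str, str] = {}
    for target_pattern, candidates in MAPPINGS.items():
        for bid in range(n_blocks):
            for src_pattern in candidates:
                out[src_pattern.format(bid=bid)] = target_pattern.format(bid=bid)
    return out

if __name__ == "__main__":
    name_map = build_name_map(n_blocks=2)
    # With the prefix gone, a plain HF-style llama4 name resolves directly.
    print(name_map["model.layers.0.self_attn.q_proj"])  # -> blk.0.attn_q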