Commit 085e023

Authored by ctcanbol, MekkCyber, and jusjinuk
Fix Qwen3 MoE GGUF architecture mismatch (#39976)
* fix qwen3moe gguf architecture
* Fix Qwen3Moe GGUF loading

---------

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Co-authored-by: Jinuk Kim <jusjinuk@snu.ac.kr>
1 parent 2ce0dae commit 085e023

2 files changed: +9 -3 lines

src/transformers/integrations/ggml.py

Lines changed: 2 additions & 1 deletion
@@ -102,13 +102,14 @@
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",
         "vocab_size": "vocab_size",
     },
-    "qwen3moe": {
+    "qwen3_moe": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
         "feed_forward_length": "intermediate_size",
         "embedding_length": "hidden_size",
         "rope.dimension_count": None,
         "rope.freq_base": "rope_theta",
+        "attention.key_length": "head_dim",
         "attention.head_count": "num_attention_heads",
         "attention.head_count_kv": "num_key_value_heads",
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",

src/transformers/modeling_gguf_pytorch_utils.py

Lines changed: 7 additions & 2 deletions
@@ -246,6 +246,7 @@ def process(self, weights, name, **kwargs):
 TENSOR_PROCESSORS = {
     "llama": LlamaTensorProcessor,
     "qwen2moe": Qwen2MoeTensorProcessor,
+    "qwen3moe": Qwen2MoeTensorProcessor,
     "bloom": BloomTensorProcessor,
     "t5": T5TensorProcessor,
     "t5encoder": T5TensorProcessor,
@@ -295,6 +296,8 @@ def get_gguf_hf_weights_map(
         model_type = "command-r"
     elif model_type == "qwen2_moe":
         model_type = "qwen2moe"
+    elif model_type == "qwen3_moe":
+        model_type = "qwen3moe"
     elif model_type == "gemma3_text":
         model_type = "gemma3"
     arch = None
@@ -316,8 +319,8 @@ def get_gguf_hf_weights_map(
     gguf_to_hf_name_map = {}
     state_dict = hf_model.state_dict()
     for hf_name in state_dict:
-        # An exception for qwen2moe model, where the expert layers are packed
-        if model_type == "qwen2moe" and "mlp.experts." in hf_name:
+        # An exception for qwen2moe/qwen3moe model, where the expert layers are packed
+        if model_type in ("qwen2moe", "qwen3moe") and "mlp.experts." in hf_name:
             hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name)
 
         name, suffix = hf_name, ""
@@ -391,6 +394,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
 
     if "qwen2moe" in architecture:
         updated_architecture = "qwen2_moe"
+    elif "qwen3moe" in architecture:
+        updated_architecture = "qwen3_moe"
 
     # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
     # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors
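
The expert-packing exception mirrors the existing qwen2moe path: expert weights are stored packed in the GGUF file, so the numbered expert index in the Hugging Face parameter name has to be stripped before the name map is built. A small standalone sketch of that renaming, reusing the regular expression from the diff above (the parameter names are illustrative):

import re

# Illustrative HF parameter names for a MoE block; the renaming mirrors the
# re.sub call in get_gguf_hf_weights_map.
hf_names = [
    "model.layers.0.mlp.experts.0.gate_proj.weight",
    "model.layers.0.mlp.experts.17.up_proj.weight",
]
for name in hf_names:
    packed = re.sub(r"mlp.experts.\d+.", "mlp.experts.", name)
    print(f"{name} -> {packed}")
# model.layers.0.mlp.experts.0.gate_proj.weight -> model.layers.0.mlp.experts.gate_proj.weight
# model.layers.0.mlp.experts.17.up_proj.weight -> model.layers.0.mlp.experts.up_proj.weight

With both files patched, a Qwen3 MoE GGUF checkpoint should load through the usual GGUF entry point, along these lines (the repo id and filename below are placeholders, not a specific published checkpoint):

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-org/Qwen3-MoE-GGUF"  # placeholder repo id
gguf_file = "model-q4_k_m.gguf"      # placeholder filename

tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=gguf_file)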
