From a404527e90173bacfd55a9c8c0b9fc47dda32e4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=87a=C4=9Fr=C4=B1=20Tu=C4=9Frul=20Canbol?=
 <103742287+ctcanbol@users.noreply.github.com>
Date: Thu, 24 Jul 2025 13:09:50 +0200
Subject: [PATCH 1/4] support loading qwen3 gguf

---
 src/transformers/integrations/ggml.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 17f86e166737..ecf34bbf5e19 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -102,6 +102,20 @@
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",
         "vocab_size": "vocab_size",
     },
+    "qwen3moe": {
+        "context_length": "max_position_embeddings",
+        "block_count": "num_hidden_layers",
+        "feed_forward_length": "intermediate_size",
+        "embedding_length": "hidden_size",
+        "rope.dimension_count": None,
+        "rope.freq_base": "rope_theta",
+        "attention.head_count": "num_attention_heads",
+        "attention.head_count_kv": "num_key_value_heads",
+        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
+        "vocab_size": "vocab_size",
+        "expert_count": "num_experts",
+        "expert_used_count": "num_experts_per_tok",
+    },
     "falcon": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
@@ -689,6 +703,7 @@ def converted(self) -> Tokenizer:
     "qwen2": GGUFQwen2Converter,
     "qwen2_moe": GGUFQwen2Converter,
     "qwen3": GGUFQwen2Converter,
+    "qwen3_moe": GGUFQwen2Converter,
     "phi3": GGUFPhi3Converter,
     "bloom": GGUFGPTConverter,
     "falcon": GGUFGPTConverter,

From 6982becb69009efba40f206005b55ebac5e519dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=87a=C4=9Fr=C4=B1=20Tu=C4=9Frul=20Canbol?=
 <103742287+ctcanbol@users.noreply.github.com>
Date: Thu, 24 Jul 2025 13:43:07 +0200
Subject: [PATCH 2/4] qwen3moe test cases

---
 tests/quantization/ggml/test_ggml.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index d2ed7f7a745d..6c60bab12bd5 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -302,6 +302,7 @@ class GgufModelTests(unittest.TestCase):
     gemma3_text_model_id = "unsloth/gemma-3-1b-it-GGUF"
     gemma3_vision_model_id = "unsloth/gemma-3-4b-it-GGUF"
     qwen3_model_id = "Qwen/Qwen3-0.6B-GGUF"
+    qwen3moe_model_id = "Qwen/Qwen3-235B-A22B-Instruct-2507"
     q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf"
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
 
@@ -335,6 +336,7 @@ class GgufModelTests(unittest.TestCase):
     bf16_gemma3_text_model_id = "gemma-3-1b-it-BF16.gguf"
     bf16_gemma3_vision_model_id = "gemma-3-4b-it-BF16.gguf"
     q8_0_qwen3_model_id = "Qwen3-0.6B-Q8_0.gguf"
+    q8_qwen3moe_model_id = "unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF"
 
     example_text = "Hello"
 
@@ -973,3 +975,17 @@ def test_qwen3_q8_0(self):
 
         EXPECTED_TEXT = "HelloED\nI need to find the value of the"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
+    def test_qwen3moe_q8(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.qwen3moe_model_id, gguf_file=self.q8_qwen3moe_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.qwen3moe_model_id,
+            gguf_file=self.q8_qwen3moe_model_id,
+            torch_dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt")
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I am a 20 year old male"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

From 237bc7043b5f74a3a1de413f40c8ea9f27b5b7cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=87a=C4=9Fr=C4=B1=20Tu=C4=9Frul=20Canbol?=
 <103742287+ctcanbol@users.noreply.github.com>
Date: Thu, 24 Jul 2025 17:12:14 +0200
Subject: [PATCH 3/4] fix whitespaces

---
 tests/quantization/ggml/test_ggml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index 6c60bab12bd5..13f49015909f 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -975,7 +975,7 @@ def test_qwen3_q8_0(self):
 
         EXPECTED_TEXT = "HelloED\nI need to find the value of the"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
-    
+
     def test_qwen3moe_q8(self):
         tokenizer = AutoTokenizer.from_pretrained(self.qwen3moe_model_id, gguf_file=self.q8_qwen3moe_model_id)
         model = AutoModelForCausalLM.from_pretrained(

From 60118d2e1bfdaf2c3715fe0d24cef99c047a9ae8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=87a=C4=9Fr=C4=B1=20Tu=C4=9Frul=20Canbol?=
 <103742287+ctcanbol@users.noreply.github.com>
Date: Tue, 29 Jul 2025 02:08:26 +0200
Subject: [PATCH 4/4] fix ggml tests

---
 tests/quantization/ggml/test_ggml.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index 13f49015909f..408ab47a6296 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -302,7 +302,7 @@ class GgufModelTests(unittest.TestCase):
     gemma3_text_model_id = "unsloth/gemma-3-1b-it-GGUF"
     gemma3_vision_model_id = "unsloth/gemma-3-4b-it-GGUF"
     qwen3_model_id = "Qwen/Qwen3-0.6B-GGUF"
-    qwen3moe_model_id = "Qwen/Qwen3-235B-A22B-Instruct-2507"
+    qwen3moe_model_id = "Qwen/Qwen3-30B-A3B-GGUF"
     q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf"
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
 
@@ -336,7 +336,7 @@ class GgufModelTests(unittest.TestCase):
     bf16_gemma3_text_model_id = "gemma-3-1b-it-BF16.gguf"
     bf16_gemma3_vision_model_id = "gemma-3-4b-it-BF16.gguf"
     q8_0_qwen3_model_id = "Qwen3-0.6B-Q8_0.gguf"
-    q8_qwen3moe_model_id = "unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF"
+    q4_k_m_qwen3moe_model_id = "Qwen3-30B-A3B-Q4_K_M.gguf"
 
     example_text = "Hello"
 
@@ -976,11 +976,11 @@ def test_qwen3_q8_0(self):
         EXPECTED_TEXT = "HelloED\nI need to find the value of the"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
-    def test_qwen3moe_q8(self):
-        tokenizer = AutoTokenizer.from_pretrained(self.qwen3moe_model_id, gguf_file=self.q8_qwen3moe_model_id)
+    def test_qwen3moe_q4_k_m(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.qwen3moe_model_id, gguf_file=self.q4_k_m_qwen3moe_model_id)
         model = AutoModelForCausalLM.from_pretrained(
             self.qwen3moe_model_id,
-            gguf_file=self.q8_qwen3moe_model_id,
+            gguf_file=self.q4_k_m_qwen3moe_model_id,
             torch_dtype=torch.float16,
         )
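
For context, the end-to-end path this series enables looks like the following minimal sketch, mirroring the final test. The model id and GGUF filename are the ones patch 4 settles on; the generated text will vary by checkpoint and transformers version.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-30B-A3B-GGUF"
gguf_file = "Qwen3-30B-A3B-Q4_K_M.gguf"

# Passing gguf_file routes loading through the GGUF integration, which maps
# GGUF metadata keys to config fields via GGUF_CONFIG_MAPPING["qwen3moe"]
# (e.g. expert_count -> num_experts) and dequantizes the checkpoint tensors.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file, torch_dtype=torch.float16)

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))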