[GGUF] Fix Gemma3 quantization support

lucianommartins · lucianommartins · commit 96ca10fa183e · 2025-10-05T20:50:15.000Z
This commit implements complete GGUF quantization support for Gemma3 models with true Q4_0 compression, addressing gibberish output and enabling 50% memory reduction. Changes: 1. gguf_loader.py: Add gemma3_text -> gemma3 model type mapping 2. gemma3.py: - Add Gemma3 RMSNorm weight correction (-1.0 offset) - Fix qweight_type tensor shape (scalar -> [1]) - Fix F16 embedding handling (no reshape needed) - Enable GGUF quantization in linear layers - Handle UninitializedParameter for GGUF layers Key fixes: - RMSNorm correction: Gemma3 uses (1+weight) convention but GGUF stores full values, requiring -1.0 subtraction - F16 embeddings: GGUF raw data is already in PyTorch layout, preventing data corruption from unnecessary reshape operations - qweight_type shape: GGUF layers expect shape [1] not scalar [] Tested on: - 8 Gemma3 variants (1B-27B parameters) - Both instruction-tuned and pretrained versions - Q4_0 quantization format - 100% success rate with coherent text generation Fixes #14753, #15480 Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
@@ -63,6 +63,10 @@ def _get_gguf_weights_map(self, model_config: ModelConfig):
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
+        if model_type == "gemma3_text":
+            # Gemma3 models use "gemma3_text" in HuggingFace but
+            # "gemma3" in GGUF architecture naming
+            model_type = "gemma3"
         if model_type in ("deepseek_v3", "deepseek_v2"):
             model_type = "deepseek2"
             # GGUF layer map assumes that we will have a merged expert weights
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
@@ -435,6 +435,17 @@ def load_weights(self, weights: Iterable[tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
+            # Apply GGUF-specific RMSNorm weight correction for Gemma3
+            # This must happen BEFORE any transformations (transpose, etc.)
+            # GemmaRMSNorm computes: output = x * (1 + weight)
+            # GGUF stores full weight values (for standard x * weight)
+            # but vLLM's GemmaRMSNorm expects (weight - 1) since it adds 1
+            # during the forward pass.
+            if (self.quant_config is not None
+                    and self.quant_config.get_name() == "gguf"
+                    and 'norm' in name and len(loaded_weight.shape) == 1):
+                loaded_weight = loaded_weight - 1.0
+
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
                 # Loading kv cache scales for compressed-tensors quantization