@@ -809,29 +809,87 @@ def gguf_quant_weights_iterator(
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """
     Iterate over the quant weights in the model gguf files and convert
-    them to torch tensors
+    them to torch tensors.
+
+    This iterator handles both quantized and unquantized GGUF weights,
+    applying model-specific corrections where needed (e.g., Gemma3 RMSNorm).
     """

     reader = gguf.GGUFReader(gguf_file)

+    # Detect Gemma3 models to apply architecture-specific weight corrections.
+    # Gemma3 uses a different RMSNorm convention than standard GGUF format:
+    # - GGUF stores: full weight values
+    # - vLLM expects: weight - 1.0 (due to "x * (1 + weight)" computation)
+    is_gemma3 = False
+    try:
+        arch_field = reader.get_field("general.architecture")
+        if arch_field and arch_field.parts[-1].tobytes().decode(
+                'utf-8') == "gemma3":
+            is_gemma3 = True
+            logger.info(
+                "Detected Gemma3 model: will apply RMSNorm weight correction")
+    except Exception:
+        # Architecture field may not exist in older GGUF files
+        pass
+
+    # First pass: yield quantization type metadata for GGUF quantized layers.
+    # This metadata tells vLLM's GGUF quantization layers what format to expect.
     for tensor in reader.tensors:
         if tensor.name in gguf_to_hf_name_map:
             weight_type = tensor.tensor_type
             name = gguf_to_hf_name_map[tensor.name]

-            if weight_type.name != "F32":
+            # Only yield qweight_type for truly quantized weights
+            # (not F16/BF16/F32). F16/BF16 embeddings are stored
+            # unquantized and handled as regular PyTorch tensors,
+            # so they should not get quantization metadata.
+            if weight_type.name not in ("F32", "F16", "BF16"):
                 weight_type_name = name.replace("weight", "qweight_type")
-                weight_type = torch.tensor(weight_type)
+                # GGUF quantization layers expect qweight_type as
+                # a 1D tensor [1], not a scalar []. This matches the
+                # parameter shape created in
+                # GGUFLinearMethod.create_weights()
+                weight_type = torch.tensor([weight_type])
                 yield weight_type_name, weight_type

+    # Second pass: yield actual weight data
     for tensor in reader.tensors:
         if tensor.name in gguf_to_hf_name_map:
             weight = tensor.data
             weight_type = tensor.tensor_type
             name = gguf_to_hf_name_map[tensor.name]
-            if weight_type.name != "F32":
+
+            # Handle quantized weights (Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, etc.)
+            if weight_type.name not in ("F32", "F16", "BF16"):
+                # For quantized weights, yield raw GGUF tensor data.
+                # The GGUF quantization layers will handle
+                # dequantization on-demand during inference, keeping
+                # weights compressed in GPU memory.
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+                param = torch.tensor(weight)
+            else:
+                # Handle unquantized weights (F32/F16/BF16).
+                # These are typically used for embeddings and bias terms.
+
+                # Do NOT reshape F16/BF16 weights.
+                # GGUF stores F16/BF16 data in the same memory layout
+                # as PyTorch. While GGUF metadata may show transposed
+                # dimensions, the raw data is already correct and
+                # reshaping would corrupt it.
+                param = torch.tensor(weight)
+
+            # Apply Gemma3-specific RMSNorm weight correction.
+            # GemmaRMSNorm computes: output = x * (1 + weight)
+            # Standard PyTorch: output = x * weight
+            #
+            # GGUF stores full weight values (for x * weight),
+            # but vLLM's GemmaRMSNorm expects (weight - 1) since
+            # it adds 1 during the forward pass. Without this
+            # correction, the model produces gibberish output.
+            if is_gemma3 and 'norm' in name and len(param.shape) == 1:
+                param = param - 1.0
+
             yield name, param


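For context on the Gemma3 correction in the second pass, here is a minimal sketch (not part of the diff) of why subtracting 1.0 from the GGUF-stored norm weights is equivalent under an "x * (1 + weight)" formulation. The rmsnorm_* helpers below are simplified stand-ins written for this illustration, not vLLM's actual GemmaRMSNorm implementation.

import torch

def rmsnorm_standard(x, weight, eps=1e-6):
    # Standard convention: GGUF stores weights meant for normed(x) * weight.
    normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return normed * weight

def rmsnorm_gemma_style(x, weight, eps=1e-6):
    # Gemma/vLLM convention: the layer adds 1 at forward time,
    # i.e. normed(x) * (1 + weight).
    normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return normed * (1.0 + weight)

x = torch.randn(2, 8)
gguf_weight = torch.randn(8) + 1.0  # full weight values, as stored in GGUF
corrected = gguf_weight - 1.0       # the loader's correction (param - 1.0)

# Both paths produce the same output, which is why the loader subtracts 1.0.
assert torch.allclose(rmsnorm_standard(x, gguf_weight),
                      rmsnorm_gemma_style(x, corrected), atol=1e-6)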