diff --git a/vllm/config.py b/vllm/config.py index d2b68b6fa1fe2..319c1569f5e98 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -168,13 +168,18 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: - hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. if (hf_quant_method == "gptq" and "is_marlin_format" in hf_quant_config and hf_quant_config["is_marlin_format"]): + logger.info("The model is serialized in Marlin format. " + "Using Marlin kernel.") hf_quant_method = "marlin" + if self.quantization == "gptq": + self.quantization = hf_quant_method + if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: