From 0b754e09b02fc05a02f257df81867ed12d4dc239 Mon Sep 17 00:00:00 2001
From: jingyu
Date: Mon, 11 Aug 2025 20:28:03 +0000
Subject: [PATCH] Qwen FP8 Support

Support ModelOpt FP8 checkpoints for Qwen3 MoE: fall back to a
top-level "quant_algo" key when hf_quant_config.json does not nest it
under "quantization", and remap FP8 kv-scale names while loading
weights so the scales resolve to their attention parameters instead of
being dropped.

Signed-off-by: jingyu
---
 vllm/config/__init__.py                 |  5 +++--
 vllm/model_executor/models/qwen3_moe.py | 12 ++++++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 03ab034c62c1..9ad85edef669 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1066,8 +1066,9 @@ def _parse_quant_hf_config(self):
         # Set quant_method for ModelOpt models.
         producer_name = quant_cfg.get("producer", {}).get("name")
         if producer_name == "modelopt":
-            quant_algo = quant_cfg.get("quantization",
-                                       {}).get("quant_algo")
+            quant_algo = (quant_cfg.get("quantization",
+                                        {}).get("quant_algo")
+                          or quant_cfg.get("quant_algo"))
             if quant_algo == "FP8":
                 quant_cfg["quant_method"] = "modelopt"
             elif quant_algo == "NVFP4":
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 9b49952f3724..05b15bbb50c0 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -455,6 +455,12 @@ def load_weights(self, weights: Iterable[tuple[str,
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
+                if name.endswith("scale"):
+                    remapped_name = maybe_remap_kv_scale_name(
+                        name, params_dict)
+                    if remapped_name is None:
+                        continue
+                    name = remapped_name
                 # We have mlp.experts[0].gate_proj in the checkpoint.
                 # Since we handle the experts below in expert_params_mapping,
                 # we need to skip here BEFORE we update the name, otherwise
@@ -475,7 +481,9 @@ def load_weights(self, weights: Iterable[tuple[str,
                 if name.endswith("scale"):
                     # Remapping the name of FP8 kv-scale.
-                    name = maybe_remap_kv_scale_name(name, params_dict)
-                    if name is None:
+                    remapped_name = maybe_remap_kv_scale_name(
+                        name, params_dict)
+                    if remapped_name is None:
                         continue
+                    name = remapped_name
                 if name not in params_dict:
                     continue
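
Reviewer notes (not part of the commit):

The config change covers ModelOpt quant configs that carry "quant_algo"
at the top level of hf_quant_config.json rather than nesting it under
"quantization"; the fallback added above handles that flat layout. A
minimal sketch of the lookup follows. The JSON values and the
resolve_quant_algo name are illustrative, not taken from a real
checkpoint:

    # Nested layout: quant_algo lives under "quantization".
    nested_cfg = {
        "producer": {"name": "modelopt"},
        "quantization": {"quant_algo": "FP8"},
    }
    # Flat layout that the fallback adds support for.
    flat_cfg = {
        "producer": {"name": "modelopt"},
        "quant_algo": "FP8",
    }

    def resolve_quant_algo(quant_cfg: dict) -> str | None:
        # Same expression as the patched _parse_quant_hf_config:
        # prefer the nested key, then fall back to the flat one.
        return (quant_cfg.get("quantization", {}).get("quant_algo")
                or quant_cfg.get("quant_algo"))

    assert resolve_quant_algo(nested_cfg) == "FP8"
    assert resolve_quant_algo(flat_cfg) == "FP8"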
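The load_weights hunks matter because FP8 checkpoints store kv-cache
scales under projection names such as "...self_attn.k_proj.k_scale",
while the parameter is registered on the attention module. The remap in
the first hunk has to run before the stacked-parameter rename: k_proj
matches the qkv_proj stacked mapping, so without it the scale would be
rewritten to a qkv_proj name that does not exist in params_dict and
could never load. Below is a simplified, hypothetical stand-in for
maybe_remap_kv_scale_name; the real helper lives in
vllm/model_executor/models/utils.py and handles more checkpoint layouts
than this sketch:

    def maybe_remap_kv_scale_name_sketch(name, params_dict):
        # Map ModelOpt-style kv-scale names onto attention parameters.
        for ckpt_suffix, param_suffix in (
                (".self_attn.k_proj.k_scale", ".self_attn.attn.k_scale"),
                (".self_attn.v_proj.v_scale", ".self_attn.attn.v_scale"),
        ):
            if name.endswith(ckpt_suffix):
                remapped = name.replace(ckpt_suffix, param_suffix)
                # None tells the caller to skip scales with no target.
                return remapped if remapped in params_dict else None
        return name  # not a kv-scale name; leave unchanged

    params_dict = {"model.layers.0.self_attn.attn.k_scale": object()}
    assert (maybe_remap_kv_scale_name_sketch(
        "model.layers.0.self_attn.k_proj.k_scale", params_dict)
        == "model.layers.0.self_attn.attn.k_scale")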