@@ -505,7 +505,7 @@ def forward(
505505 # native FusedMoE. here we need to design a better FusedMoE
506506 # (maybe using AscendFusedMoE) to enable these different
507507 # communication schema.
508- final_hidden_states = self .experts .quant_method (
508+ final_hidden_states = self .experts .quant_method . apply (
509509 layer = self .experts ,
510510 x = hidden_states ,
511511 router_logits = router_logits ,
@@ -937,6 +937,8 @@ def sample(
937937 return next_tokens
938938
939939 def load_weights (self , weights : Iterable [Tuple [str , torch .Tensor ]]):
940+ tp_size = get_tp_group ().world_size
941+ tp_rank = get_tp_group ().rank_in_group
940942 stacked_params_mapping = [
941943 # (param_name, shard_name, shard_id)
942944 ("qkv_proj" , "q_proj" , "q" ),
@@ -972,6 +974,51 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
972974 if "module" in name :
973975 continue
974976
977+ if name .endswith ('kv_cache_offset' ):
978+ continue
979+
980+ if name .endswith ("k_proj.kv_cache_scale" ):
981+ remapped_kv_scale_name = name .replace (
982+ "k_proj.kv_cache_scale" , "attn.key_antiquant_scale" )
983+ if remapped_kv_scale_name not in params_dict :
984+ logger .warning_once (
985+ "Found kv scale in the checkpoint "
986+ f"(e.g. { name } ), but not found the expected "
987+ f"name in the model "
988+ f"(e.g. { remapped_kv_scale_name } ). "
989+ "kv-scale is not loaded." )
990+ continue
991+ else :
992+ name = remapped_kv_scale_name
993+ param = params_dict [name ]
994+ loaded_weight = torch .tensor_split (loaded_weight ,
995+ tp_size ,
996+ dim = 0 )[tp_rank ]
997+ weight_loader = getattr (param , "weight_loader" ,
998+ default_weight_loader )
999+ weight_loader (param , loaded_weight )
1000+
1001+ if name .endswith ("v_proj.kv_cache_scale" ):
1002+ remapped_kv_scale_name = name .replace (
1003+ "v_proj.kv_cache_scale" , "attn.value_antiquant_scale" )
1004+ if remapped_kv_scale_name not in params_dict :
1005+ logger .warning_once (
1006+ "Found kv scale in the checkpoint "
1007+ f"(e.g. { name } ), but not found the expected "
1008+ f"name in the model "
1009+ f"(e.g. { remapped_kv_scale_name } ). "
1010+ "kv-scale is not loaded." )
1011+ continue
1012+ else :
1013+ name = remapped_kv_scale_name
1014+ param = params_dict [name ]
1015+ loaded_weight = torch .tensor_split (loaded_weight ,
1016+ tp_size ,
1017+ dim = 0 )[tp_rank ]
1018+ weight_loader = getattr (param , "weight_loader" ,
1019+ default_weight_loader )
1020+ weight_loader (param , loaded_weight )
1021+
9751022 for (param_name , weight_name , shard_id ) in stacked_params_mapping :
9761023 # Skip non-stacked layers and experts (experts handled below).
9771024 if weight_name not in name :