Commit 6f8239c

Refactor scattered w8a8 dynamic quantization operations
AscendW8A8DynamicLinearMethod was integrated into CustomDeepseekV2MLP in an awkward way, scattering quantization operations across the model scripts. Refactor to solve this problem.

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
1 parent 95414ba commit 6f8239c
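Before this change, the W8A8-dynamic path was inlined into the model MLPs as explicit npu_dynamic_quant / npu_quant_matmul / npu_dequant_swiglu_quant calls; after it, the linear quant method and a custom SiluAndMul hand a (quantized_x, dynamic_scale) pair to each other, so model code only chains gate_up_proj -> act_fn -> down_proj. The torch-free sketch below illustrates that hand-off only; the class names and arithmetic are made-up stand-ins, not vllm-ascend APIs (the real pieces are CustomDeepseekV2MLP, CustomDeepseekV2SiluAndMul and AscendW8A8DynamicLinearMethod in the diffs that follow).

from typing import List, Tuple, Union

Quantized = Tuple[List[float], float]  # (quantized activations, dynamic scale)


class TinyLinear:
    """Stand-in for a projection whose quant method can pass the scale downstream."""

    def __init__(self, return_scale: bool):
        # Mirrors the `_dynamic_quant_config` the MLP attaches to its projections.
        self.return_scale = return_scale

    def __call__(self, x: Union[List[float], Quantized]):
        values, scale = x if isinstance(x, tuple) else (x, 1.0)
        out = [2.0 * v for v in values]  # stand-in for the quantized matmul
        return (out, scale) if self.return_scale else out


class TinySiluAndMul:
    """Stand-in for the activation: accepts a plain tensor or a (tensor, scale) pair."""

    def __call__(self, x: Union[List[float], Quantized]):
        if isinstance(x, tuple):  # pre-quantized path (fused dequant-swiglu-quant)
            values, scale = x
            return ([max(v, 0.0) for v in values], scale)
        return [max(v, 0.0) for v in x]  # unquantized path


gate_up_proj = TinyLinear(return_scale=True)   # emits (quantized values, scale)
down_proj = TinyLinear(return_scale=False)     # consumes the pair, emits plain output
act_fn = TinySiluAndMul()

hidden = [0.5, -1.0, 2.0]
# Model-level code stays quantization-agnostic: gate_up -> act_fn -> down.
print(down_proj(act_fn(gate_up_proj(hidden))))

In the real code the scale-carrying tuple is produced by AscendW8A8DynamicLinearMethod.apply() when "return_scale" is set in `_dynamic_quant_config`, and consumed either by CustomDeepseekV2SiluAndMul.forward_oot() or by the next apply() call.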

3 files changed, +83 / -181 lines

vllm_ascend/models/deepseek_dbo.py

Lines changed: 4 additions & 106 deletions
@@ -29,7 +29,7 @@
 
 import torch
 import torch.distributed as dist
-import torch_npu
+import torch_npu  # noqa: F401
 import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
@@ -40,13 +40,10 @@
                               get_tp_group, tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
                                                ReplicatedLinear,
-                                               RowParallelLinear,
-                                               UnquantizedLinearMethod)
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -67,6 +64,7 @@
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP
 from vllm_ascend.multistream.base import MSEventKey
 from vllm_ascend.multistream.context import (
     advance_step_multistream_layer_context, get_multistream_comm_context,
@@ -78,117 +76,17 @@
     make_multistream_metadata_ds)
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
-from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import dispose_tensor
 
 VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
 
 
-class CustomDeepseekDBOMLP(nn.Module):
-
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: bool = True,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
-        if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
-
-        # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant
-        self.is_dynamic_quant = not isinstance(
-            self.gate_up_proj.quant_method,
-            UnquantizedLinearMethod) and isinstance(
-                self.gate_up_proj.quant_method.quant_method,
-                AscendW8A8DynamicLinearMethod)
-
-    def forward(self, x):
-        if self.is_dynamic_quant:
-            x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.gate_up_proj.weight,
-                self.gate_up_proj.weight_scale,
-                output_dtype=torch.int32,
-            )
-            x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
-                x=x,
-                weight_scale=self.gate_up_proj.weight_scale_fp32,
-                activation_scale=dynamic_scale,
-                bias=None,
-                quant_scale=None,
-                quant_offset=None,
-                group_index=None,
-                activate_left=True,
-                quant_mode=1)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.down_proj.weight,
-                self.down_proj.weight_scale,
-                pertoken_scale=dynamic_scale,
-                output_dtype=torch.bfloat16,
-            )
-            if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
-                x = tensor_model_parallel_all_reduce(x)
-            return x
-        gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
+class CustomDeepseekDBOMLP(CustomDeepseekV2MLP):
 
     def _forward_ms_mlp(self, x):
         current_ms_metadata = get_multistream_comm_context()
         assert current_ms_metadata is not None
-        if self.is_dynamic_quant:
-            x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.gate_up_proj.weight,
-                self.gate_up_proj.weight_scale,
-                output_dtype=torch.int32,
-            )
-            x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
-                x=x,
-                weight_scale=self.gate_up_proj.weight_scale_fp32,
-                activation_scale=dynamic_scale,
-                bias=None,
-                quant_scale=None,
-                quant_offset=None,
-                group_index=None,
-                activate_left=True,
-                quant_mode=1)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.down_proj.weight,
-                self.down_proj.weight_scale,
-                pertoken_scale=dynamic_scale,
-                output_dtype=torch.bfloat16,
-            )
-            if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
-                current_ms_metadata.before_comm_event.record()
-                with torch.npu.stream(current_ms_metadata.comm_stream):
-                    current_ms_metadata.before_comm_event.wait()
-                    x = tensor_model_parallel_all_reduce(x)
-                    current_ms_metadata.after_comm_event.record()
-            return x
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
         current_ms_metadata.before_comm_event.record()

vllm_ascend/models/deepseek_v2.py

Lines changed: 54 additions & 36 deletions
@@ -25,7 +25,7 @@
 # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
 # """Inference-only DeepseekV2/DeepseekV3 model."""
 
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
@@ -69,12 +69,38 @@
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.parallel_state import get_ep_group
 from vllm_ascend.ops.fused_moe import AscendFusedMoE
+from vllm_ascend.quantization.quant_config import AscendLinearMethod
 from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import dispose_tensor
 
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
 
 
+class CustomDeepseekV2SiluAndMul(SiluAndMul):
+
+    def __init__(self,
+                 *,
+                 weight_scale: Optional[Callable[[], torch.Tensor]] = None):
+        super().__init__()
+        self.weight_scale = weight_scale
+
+    def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor,
+                                                       torch.Tensor]]):
+        if isinstance(x, tuple):
+            assert self.weight_scale is not None
+            # For AscendW8A8DynamicLinearMethod:
+            # a dynamic scale is passed along with the quantized value.
+            quantized_x, dynamic_scale = x
+            return torch_npu.npu_dequant_swiglu_quant(
+                x=quantized_x,
+                weight_scale=self.weight_scale(),
+                activation_scale=dynamic_scale,
+                activate_left=True,
+                quant_mode=1)
+        else:
+            return super().forward_oot(x)
+
+
 class CustomDeepseekV2MLP(nn.Module):
 
     def __init__(
@@ -101,44 +127,36 @@ def __init__(
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
-        self.act_fn = SiluAndMul()
 
-        # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant
-        self.is_dynamic_quant = not isinstance(
-            self.gate_up_proj.quant_method,
-            UnquantizedLinearMethod) and isinstance(
-                self.gate_up_proj.quant_method.quant_method,
-                AscendW8A8DynamicLinearMethod)
+        quant_method = self.gate_up_proj.quant_method
+        if isinstance(quant_method, UnquantizedLinearMethod):
+            self.act_fn = CustomDeepseekV2SiluAndMul()
+        elif (isinstance(quant_method, AscendLinearMethod) and isinstance(
+                quant_method.quant_method, AscendW8A8DynamicLinearMethod)):
+            # TODO(sdmyzlp): Currently preserved as before:
+            # 1. The only quantization supported for silu is W8A8Dynamic
+            # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16
+            #
+            # Maybe one can implement a better and more general configuration
+            # scheme, e.g. by somehow passing around the tweaked `quant_config`
+            self.act_fn = CustomDeepseekV2SiluAndMul(
+                # Use lazy binding, for `weight_scale_fp32` is accessible
+                # only after `process_weights_after_loading`.
+                weight_scale=lambda: self.gate_up_proj.weight_scale_fp32)
+            # To be consumed by AscendW8A8DynamicLinearMethod.apply()
+            self.gate_up_proj._dynamic_quant_config = {
+                "output_dtype": torch.int32,
+                "return_scale": True,
+            }
+            self.down_proj._dynamic_quant_config = {
+                "output_dtype": torch.bfloat16,
+                "return_scale": False,
+            }
+        else:
+            raise NotImplementedError(
+                f"Quantization with [{type(quant_method)}] is NOT supported")
 
     def forward(self, x):
-        if self.is_dynamic_quant:
-            x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.gate_up_proj.weight,
-                self.gate_up_proj.weight_scale,
-                output_dtype=torch.int32,
-            )
-            x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
-                x=x,
-                weight_scale=self.gate_up_proj.weight_scale_fp32,
-                activation_scale=dynamic_scale,
-                bias=None,
-                quant_scale=None,
-                quant_offset=None,
-                group_index=None,
-                activate_left=True,
-                quant_mode=1)
-            x = torch_npu.npu_quant_matmul(
-                x,
-                self.down_proj.weight,
-                self.down_proj.weight_scale,
-                pertoken_scale=dynamic_scale,
-                output_dtype=torch.bfloat16,
-            )
-            if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
-                x = tensor_model_parallel_all_reduce(x)
-            return x
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 25 additions & 39 deletions
@@ -15,13 +15,13 @@
 # limitations under the License.
 #
 
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
 import torch_npu
 import torchair as tng  # type: ignore
-from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce
+from vllm.distributed import GroupCoordinator
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
@@ -77,19 +77,9 @@ def apply_mlp(hidden_states: torch.Tensor,
     shared_experts = kwargs.get('shared_experts', None)
     if shared_experts:
         shared_gate_up = kwargs.get('shared_gate_up', None)
-        shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None)
         with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_gate_up, hidden_states)
-            shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
-                x=shared_gate_up,
-                weight_scale=shared_experts.gate_up_proj.weight_scale_fp32,
-                activation_scale=shared_dynamic_scale,
-                bias=None,
-                quant_scale=None,
-                quant_offset=None,
-                group_index=None,
-                activate_left=True,
-                quant_mode=1)
+            tng.scope.npu_wait_tensor(shared_gate_up[0], hidden_states)
+            shared_act = shared_experts.act_fn(shared_gate_up)
 
     # gmm1: gate_up_proj
     hidden_states = torch_npu.npu_grouped_matmul(
@@ -122,16 +112,9 @@ def apply_mlp(hidden_states: torch.Tensor,
 
     if shared_experts:
         with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_x, hidden_states)
-            shared_output = torch_npu.npu_quant_matmul(
-                shared_x,
-                shared_experts.down_proj.weight,
-                shared_experts.down_proj.weight_scale,
-                pertoken_scale=shared_dynamic_scale,
-                output_dtype=torch.bfloat16,
-            )
-            if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
-                shared_output = tensor_model_parallel_all_reduce(shared_output)
+            tng.scope.npu_wait_tensor(shared_act[0], hidden_states)
+            shared_output, _ = shared_experts.down_proj(shared_act)
+
     if shared_experts:
         return hidden_states, shared_output
     return hidden_states
@@ -193,17 +176,10 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
         shared_hidden_states = kwargs.get('shared_hidden_states', None)
         with tng.scope.npu_stream_switch('cv'):
             tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states)
-            shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant(
+            shared_gate_up, _ = shared_experts.gate_up_proj(
                 shared_hidden_states)
-            shared_gate_up = torch_npu.npu_quant_matmul(
-                shared_x,
-                shared_experts.gate_up_proj.weight,
-                shared_experts.gate_up_proj.weight_scale,
-                output_dtype=torch.int32,
-            )
         kwargs.update({
             "shared_gate_up": shared_gate_up,
-            "shared_dynamic_scale": shared_dynamic_scale,
         })
 
     output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
@@ -541,21 +517,31 @@ def get_perchannel_param(
     @staticmethod
     def apply(
         layer: torch.nn.Module,
-        x: torch.Tensor,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
         bias: Optional[torch.Tensor] = None,
         tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
-        original_dtype = x.dtype
-        # use ATB quantize
-        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
-        return torch_npu.npu_quant_matmul(
-            quant_out,
+        config = getattr(layer, "_dynamic_quant_config", {})
+        if not isinstance(x, tuple):
+            output_dtype = config.get("output_dtype", x.dtype)
+            quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
+        else:
+            assert "output_dtype" in config.keys(), (
+                f"DynamicLinearMethod needs explicitly specified `output_dtype`"
+                f"for pre-quantized input, got config [{config}]")
+            output_dtype = config["output_dtype"]
+            quantized_x, dynamic_scale = x
+
+        output = torch_npu.npu_quant_matmul(
+            quantized_x,
             layer.weight,
             layer.weight_scale,
             pertoken_scale=dynamic_scale,
             bias=bias,
-            output_dtype=original_dtype,
+            output_dtype=output_dtype,
        )
+        return ((output, dynamic_scale)
+                if config.get("return_scale", False) else output)
 
     def process_weights_after_loading(self, layer):
         if self.transpose_weight:
