From 49a3249e17c4271dfa19d843d7f9102f61d09282 Mon Sep 17 00:00:00 2001
From: ApsarasX
Date: Thu, 26 Jun 2025 05:13:54 +0000
Subject: [PATCH 1/2] [Bugfix] Support Qwen3-MOE on aclgraph mode

Co-authored-by: Yizhou Liu
Signed-off-by: ApsarasX
---
 vllm_ascend/ops/common_fused_moe.py | 18 +++++++++++++++++-
 vllm_ascend/ops/fused_moe.py        |  4 +++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py
index 4e21c744ae..3aa23a2a6e 100644
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -18,6 +18,7 @@
 from typing import Callable, Optional
 
 import torch
+from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.model_executor.layers.fused_moe.layer import \
     UnquantizedFusedMoEMethod
 
@@ -25,6 +26,15 @@
                                          select_experts)
 from vllm_ascend.utils import is_310p
 
+original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
+
+
+def unquantized_fused_moe_init_func(self, *args, **kwargs):
+    original_unquantized_fused_moe_init_func(self, *args, **kwargs)
+    vllm_config = get_current_vllm_config()
+    self.max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+    self.use_aclgraph = vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not vllm_config.model_config.enforce_eager
+
 
 def forward_oot(
         self,
@@ -71,6 +81,10 @@ def forward_oot(
             expert_map=expert_map,
             apply_router_weight_on_input=apply_router_weight_on_input)
 
+    # When aclgraph is used, set max_num_tokens so that the input shape
+    # of `npu_moe_init_routing` stays fixed
+    max_num_tokens = self.max_num_batched_tokens if self.use_aclgraph else None
+
     return fused_experts(
         hidden_states=x,
         w1=layer.w13_weight,
@@ -79,7 +93,9 @@ def forward_oot(
         topk_ids=topk_ids,
         top_k=top_k,
         expert_map=expert_map,
-        apply_router_weight_on_input=apply_router_weight_on_input)
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        max_num_tokens=max_num_tokens)
 
 
+UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
 UnquantizedFusedMoEMethod.forward_oot = forward_oot
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index c9fd8f2dd1..da5e8e3cbf 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -655,6 +655,7 @@ def fused_experts(
         top_k: int,
         expert_map: torch.Tensor = None,
         apply_router_weight_on_input: bool = False,
+        max_num_tokens: Optional[int] = None,
 ) -> torch.Tensor:
     """
     Fused experts with top-k routing.
@@ -748,11 +749,12 @@ def fused_experts(
                            dtype=torch.int32,
                            device=device).view(top_k, -1).permute(
                                1, 0).contiguous())
+    active_num = max_num_tokens if max_num_tokens is not None else num_tokens
     sorted_hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
         hidden_states,
         row_idx=row_idx,
         expert_idx=topk_ids,
-        active_num=num_tokens)
+        active_num=active_num)
 
     expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
         expanded_expert_idx, num_experts)

From 25f1182e6c55a0321fa2b0fa182596190bcfc41a Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Sun, 6 Jul 2025 10:35:52 +0800
Subject: [PATCH 2/2] Add aclgraph e2e test for Qwen3-30B-A3B

Signed-off-by: Yikun Jiang
---
 tests/e2e/singlecard/test_aclgraph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py
index e0bfb65cf8..4fc23aa7b3 100644
--- a/tests/e2e/singlecard/test_aclgraph.py
+++ b/tests/e2e/singlecard/test_aclgraph.py
@@ -29,7 +29,7 @@
 from tests.conftest import VllmRunner
 from tests.model_utils import check_outputs_equal
 
-MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct", "vllm-ascend/Qwen3-30B-A3B-Puring"]
 
 
 @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
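
A note on how the two changes in PATCH 1/2 fit together: the wrapped __init__ records max_num_batched_tokens and whether aclgraph is active (piecewise compilation without enforce_eager), and forward_oot then passes a fixed max_num_tokens down to fused_experts so that npu_moe_init_routing always receives the same active_num. The standalone Python sketch below restates that pattern outside of vLLM; the class name, the hard-coded config values, and pick_active_num are illustrative stand-ins, not vllm-ascend or vLLM APIs.

    from typing import Optional


    class FusedMoEMethod:
        """Toy stand-in for UnquantizedFusedMoEMethod; not the vLLM class."""

        def __init__(self, num_experts: int):
            self.num_experts = num_experts


    # Keep a reference to the original __init__, then install a wrapper that
    # records extra attributes, mirroring how the patch treats
    # UnquantizedFusedMoEMethod.__init__.
    _original_init = FusedMoEMethod.__init__


    def _patched_init(self, *args, **kwargs):
        _original_init(self, *args, **kwargs)
        # The real patch reads these from get_current_vllm_config(); they are
        # hard-coded here to keep the sketch self-contained.
        self.max_num_batched_tokens = 2048
        self.use_aclgraph = True


    FusedMoEMethod.__init__ = _patched_init


    def pick_active_num(num_tokens: int, method: FusedMoEMethod) -> int:
        """Return a fixed upper bound when graph capture needs static shapes."""
        max_num_tokens: Optional[int] = (
            method.max_num_batched_tokens if method.use_aclgraph else None)
        return max_num_tokens if max_num_tokens is not None else num_tokens


    if __name__ == "__main__":
        method = FusedMoEMethod(num_experts=8)
        # With the graph flag set, both calls return 2048, so a downstream
        # routing kernel would see a constant shape regardless of batch size.
        print(pick_active_num(17, method), pick_active_num(512, method))

Running the sketch prints 2048 twice: with the graph mode flag set, every batch maps to the same active_num, which is why the patch threads max_num_tokens through fused_experts instead of passing the per-batch num_tokens.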