Commit 562a001

apply npu_moe_gating_top_k_softmax

Signed-off-by: huangxialu <huangxialu1@huawei.com>

1 parent 4014ad2

File tree

2 files changed: +127 -1 lines changed

tests/singlecard/ops/test_fused_moe.py

Lines changed: 121 additions & 1 deletion
@@ -23,11 +23,15 @@
 # here to make the test pass.
 import vllm_ascend.patch.worker.patch_common.patch_utils  # type: ignore[import] # isort: skip # noqa
 
+from unittest.mock import MagicMock, patch
+
 import pytest
 import torch
 from vllm.model_executor.layers.activation import SiluAndMul
 
-from vllm_ascend.ops.fused_moe import fused_experts
+from vllm_ascend.ascend_forward_context import FusedMoEState
+from vllm_ascend.ops.fused_moe import (AscendUnquantizedFusedMoEMethod,
+                                       fused_experts)
 
 NUM_EXPERTS = [8, 64]
 EP_SIZE = [1, 4]

@@ -98,3 +102,119 @@ def test_fused_experts(
     # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
     torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
     torch.npu.empty_cache()
+
+
+@pytest.mark.parametrize("m", [1, 33, 64])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 511])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("renormalize", [True, False])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("device", DEVICE)
+def test_ascend_unquantized_fused_moe_softmax(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    renormalize: bool,
+    dtype: torch.dtype,
+    device: str,
+):
+
+    class MockVllmConfig:
+
+        @property
+        def scheduler_config(self):
+
+            class SchedulerConfig:
+                max_num_seqs = 256
+
+            return SchedulerConfig()
+
+        @property
+        def model_config(self):
+
+            class ModelConfig:
+                max_model_len = 2048
+
+            return ModelConfig()
+
+    class MockAscendConfig:
+
+        @property
+        def torchair_graph_config(self):
+
+            class TorchairGraphConfig:
+                enabled = False
+
+            return TorchairGraphConfig()
+
+    class MockMC2Group:
+
+        @property
+        def device_group(self):
+            return MagicMock()
+
+    class MockForwardContext:
+
+        @property
+        def fused_moe_state(self):
+            return FusedMoEState.AllGather
+
+    class MockLayer(torch.nn.Module):
+
+        def __init__(self):
+            super().__init__()
+            self.w13_weight = torch.randn(
+                (e, 2 * n, k), device=device, dtype=dtype) / 10
+            self.w2_weight = torch.randn(
+                (e, k, n), device=device, dtype=dtype) / 10
+
+    x = torch.randn((m, k), device=device, dtype=dtype) / 10
+    router_logits = torch.randn((m, e), device=device, dtype=dtype)
+
+    with patch('vllm_ascend.ops.fused_moe.get_current_vllm_config') as mock_get_vllm_config, \
+            patch('vllm_ascend.ops.fused_moe.get_ascend_config') as mock_get_ascend_config, \
+            patch('vllm_ascend.ops.fused_moe.get_mc2_group') as mock_get_mc2_group, \
+            patch('vllm_ascend.ops.fused_moe.get_forward_context') as mock_get_context, \
+            patch('vllm_ascend.ops.fused_moe.fused_experts') as mock_fused_experts, \
+            patch('torch.distributed.get_rank') as mock_get_rank:
+        mock_get_vllm_config.return_value = MockVllmConfig()
+        mock_get_ascend_config.return_value = MockAscendConfig()
+        mock_get_mc2_group.return_value = MockMC2Group()
+        mock_get_context.return_value = MockForwardContext()
+        mock_fused_experts.return_value = torch.zeros_like(x)
+
+        mock_get_rank.side_effect = AttributeError("mock error")
+
+        method = AscendUnquantizedFusedMoEMethod()
+        layer = MockLayer()
+
+        output = method.apply(
+            layer=layer,
+            x=x,
+            router_logits=router_logits,
+            top_k=topk,
+            renormalize=renormalize,
+            scoring_func="softmax",
+            global_num_experts=e,
+        )
+
+        assert method.moe_all_to_all_group_name is None
+
+        assert mock_fused_experts.called
+        call_args = mock_fused_experts.call_args[1]
+
+        topk_weights = call_args['topk_weights']
+        topk_ids = call_args['topk_ids']
+
+        assert topk_weights.shape == (m, topk)
+        assert topk_ids.shape == (m, topk)
+        assert output.shape == (m, k)
+        assert topk_weights.dtype == dtype
+        assert topk_ids.dtype == torch.int32
+        assert output.dtype == dtype
+
+        torch.npu.empty_cache()
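
To exercise just the new test locally, a minimal pytest runner sketch; it assumes pytest is installed and an Ascend NPU is available, since the test allocates tensors on the parametrized device and calls torch.npu.empty_cache(). This runner is not part of the commit.

# Hypothetical runner: selects only the new test via pytest's -k expression
# and exits with pytest's return code.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "tests/singlecard/ops/test_fused_moe.py",
        "-k", "test_ascend_unquantized_fused_moe_softmax",
        "-v",
    ]))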

vllm_ascend/ops/fused_moe.py

Lines changed: 6 additions & 0 deletions
@@ -1005,6 +1005,12 @@ def apply(
                 routed_scaling_factor=1,
                 eps=float(1e-20),
             )
+        elif scoring_func == "softmax":
+            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax(
+                x=router_logits, finished=None, k=top_k)
+            if renormalize:
+                topk_weights = topk_weights / topk_weights.sum(dim=-1,
+                                                               keepdim=True)
         else:
             topk_weights, topk_ids = select_experts(
                 hidden_states=x,
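
For reference, a minimal pure-PyTorch sketch of the gating math the new branch relies on: softmax over the router logits, then top-k selection, with the same optional renormalization the commit adds. This only illustrates the assumed semantics of the fused kernel, not its implementation; softmax_top_k_reference is a hypothetical helper that exists in neither torch_npu nor vllm_ascend.

# Hypothetical reference of the assumed softmax top-k gating semantics.
# The real code path calls the fused kernel torch_npu.npu_moe_gating_top_k_softmax.
import torch


def softmax_top_k_reference(router_logits: torch.Tensor, k: int,
                            renormalize: bool = False):
    # Probabilities over experts per token: shape (num_tokens, num_experts).
    scores = torch.softmax(router_logits, dim=-1)
    # Keep the k largest weights and their expert indices for each token.
    topk_weights, topk_ids = torch.topk(scores, k, dim=-1)
    if renormalize:
        # Rescale so the k selected weights sum to 1 per token, mirroring
        # the `renormalize` branch added by this commit.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    # int32 ids match the dtype the new test asserts for topk_ids.
    return topk_weights, topk_ids.to(torch.int32)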
