 import pytest
 import torch
 import torch.nn as nn
+import torch_npu
 from pytest_mock import MockerFixture
 
+from vllm_ascend.ascend_forward_context import get_fused_moe_state
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
                                        AscendUnquantizedFusedMoEMethod)
-from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
 
 adapt_patch(True)
 
 
-def mock_ep_group(mocker):
+def mock_ep_and_mc2_group(mocker):
     mock_group = mocker.MagicMock()
     mock_group.rank_in_group = 0
     mock_group.rank = 0
@@ -52,7 +54,8 @@ def mock_dist_env(mocker: MockerFixture):
 
     with patch('torch.distributed.get_rank', return_value=0), \
         patch('torch.distributed.get_world_size', return_value=4), \
-        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
         patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
@@ -73,7 +76,7 @@ def mock_dist_env(mocker: MockerFixture):
               return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
         patch('vllm_ascend.ops.fused_moe.get_forward_context',
               return_value=MagicMock(
-                  attn_metadata=MagicMock(max_num_tokens_across_dp=10),
+                  max_tokens_across_dp=10,
                   dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
               )), \
         patch('vllm_ascend.ops.fused_moe.get_current_vllm_config',
@@ -122,7 +125,14 @@ def mock_moe_env(mocker: MockerFixture):
         patch("torch_npu.npu_moe_finalize_routing", return_value=(
             torch.randn(16, 2)
         )):
-        yield
+        if hasattr(torch_npu, 'npu_moe_distribute_dispatch_v2'):
+            with patch("torch_npu.npu_moe_distribute_dispatch_v2", return_value=(
+                    torch.randn(16, 2))), \
+                patch("torch_npu.npu_moe_distribute_combine_v2", return_value=(
+                    torch.randn(16, 2))):
+                yield
+        else:
+            yield
 
 
 @pytest.fixture
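The duplicated yield in the hunk above could also be expressed with contextlib.ExitStack, which registers the v2 dispatch/combine patches only when the installed torch_npu exposes them. A minimal sketch of that alternative (not part of this commit; the mock_optional_v2_ops name is hypothetical and the return shapes simply mirror the mocks above):

import contextlib
from unittest.mock import patch

import torch
import torch_npu


@contextlib.contextmanager
def mock_optional_v2_ops():
    # Enter the v2 dispatch/combine patches only when this torch_npu build
    # exposes them; otherwise yield with no extra patches active.
    with contextlib.ExitStack() as stack:
        for name in ("npu_moe_distribute_dispatch_v2",
                     "npu_moe_distribute_combine_v2"):
            if hasattr(torch_npu, name):
                stack.enter_context(
                    patch(f"torch_npu.{name}", return_value=torch.randn(16, 2)))
        yield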
@@ -237,11 +247,16 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
         moe.moe_parallel_config.ep_size = 1
 
         moe.quant_method = MockQuantMethod(shared_experts, num_tokens)
-        output = moe.forward(inputs,
-                             router_logits,
-                             is_prefill=is_prefill,
-                             top_k=top_k,
-                             shared_experts=shared_experts)
+        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
+                                                         dtype=torch.bool),
+                                    padded_num_tokens=num_tokens)
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+                   return_value=forward_context):
+            output = moe.forward(inputs,
+                                 router_logits,
+                                 is_prefill=is_prefill,
+                                 top_k=top_k,
+                                 shared_experts=shared_experts)
 
         moe.quant_method.apply.assert_called_once()
 
@@ -288,15 +303,20 @@ def test_process_weights_after_loading(self, moe_method, mock_dist_env):
     def test_apply_without_expert_map(self, moe_method, mock_dist_env,
                                       mock_moe_env, others_param):
         """
-        1 test is_deepseek_v3_r1=true and use fused_expters_with_all2all
+        1 test is_deepseek_v3_r1=true and use fused_experts_with_all2all
         2 test use_select_experts and fused_experts
         3 test use select_gating_topk_softmax_experts and fused_experts
         4 test use select_experts and fused_experts_with_all2all_buffer
         """
         global_num_experts, ep_size, select_softmax = others_param
+        is_prefill = False
+        is_deepseek_v3_r1 = global_num_experts == 256
+        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
+            ep_size, is_prefill, is_deepseek_v3_r1))
         with patch(
                 "vllm_ascend.ops.fused_moe.SELECT_GATING_TOPK_SOTFMAX_EXPERTS",
-                select_softmax):
+                select_softmax), \
+            patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context):
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)
             router_logits = torch.randn(8, 8)
@@ -309,7 +329,7 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
                 top_k=2,
                 renormalize=True,
                 global_num_experts=global_num_experts,
-                is_prefill=False)
+                is_prefill=is_prefill)
 
             if ep_size == 1:
                 assert result.shape == (16, 2)
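When tracing which of the four documented branches a given parametrization exercises, the state the test injects can be reproduced in isolation. A small sketch, assuming only the positional (ep_size, is_prefill, is_deepseek_v3_r1) call shown in the hunk above; the concrete values returned by get_fused_moe_state are not visible in this diff:

from unittest.mock import MagicMock, patch

from vllm_ascend.ascend_forward_context import get_fused_moe_state

# Same derivation as the test: ep_size and the prefill flag come from the
# parametrization, and 256 global experts marks the DeepSeek V3/R1 layout.
state = get_fused_moe_state(16, False, True)  # ep_size, is_prefill, is_deepseek_v3_r1
print(state)  # shows which fused-MoE dispatch path the mocked context will report

forward_context = MagicMock(fused_moe_state=state)
with patch("vllm_ascend.ops.fused_moe.get_forward_context",
           return_value=forward_context):
    pass  # moe_method.apply(...) would observe this state, as in the test body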
@@ -327,8 +347,13 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
         4 test use_select_experts and fused_experts
         """
         ep_size, alltoall_buffer = others_param
+        is_prefill = False
+        forward_context = MagicMock(
+            fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
         with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
-                   alltoall_buffer):
+                   alltoall_buffer), \
+            patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
+            patch("vllm_ascend.ops.fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
             expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)
@@ -347,7 +372,7 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                 renormalize=True,
                 global_num_experts=128,
                 expert_map=expert_map,
-                is_prefill=False)
+                is_prefill=is_prefill)
 
             if ep_size == 16 or ep_size == 1:
                 assert result.shape == (16, 2)
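For readers unfamiliar with the expert_map tensor used in both apply tests, the usual vLLM convention (an assumption here, not spelled out in this diff) is that entry i holds the local index of global expert i on the current rank, with -1 for experts owned by other ranks; this is presumably also why the fixture earlier mocks a return value of (3, torch.tensor([0, 1, 2, -1, ...])). A short illustration:

import torch

# Global-to-local expert mapping: this rank hosts global experts 0, 1 and 2;
# -1 marks experts that live on other ranks.
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])

num_local_experts = int((expert_map >= 0).sum())              # 3
owned_global_ids = torch.nonzero(expert_map >= 0).flatten()   # tensor([0, 1, 2])
local_index_of_global_2 = int(expert_map[2])                  # 2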