Commit 20d1afc

Merge branch 'main' into dev_multistream_overlap
2 parents 9b50520 + c94afd7 commit 20d1afc

File tree: 12 files changed (+459 -166 lines)

docs/source/developer_guide/evaluation/index.md

Lines changed: 1 addition & 0 deletions
@@ -12,4 +12,5 @@ using_evalscope
 :caption: Performance
 :maxdepth: 1
 performance_benchmark
+profile_execute_duration
 :::
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# Profile Execute Duration
+
+The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically this is done by calling `torch.npu.synchronize()` and taking CPU timestamps, which adds host/device synchronization overhead.
+
+**To reduce this overhead, this feature uses the NPU event timestamp mechanism to observe device execution time asynchronously.**
+
+## Usage
+* Set the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to enable this feature.
+* Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously wherever you need to observe an execution duration.
+* Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to retrieve and print the execution durations of all observed stages.
+
+## Example Output
+
+```
+5691:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.17ms [prepare input and forward]:9.57ms [forward]:4.14ms
+5695:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.29ms [prepare input and forward]:10.19ms [forward]:4.14ms
+5697:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.81ms [prepare input and forward]:10.29ms [forward]:3.99ms
+5701:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.10ms [prepare input and forward]:10.62ms [forward]:4.33ms
+5705:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.65ms [prepare input and forward]:9.58ms [forward]:4.20ms
+5709:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.43ms [prepare input and forward]:9.88ms [forward]:4.20ms
+5711:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.89ms [prepare input and forward]:10.49ms [forward]:4.19ms
+5715:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.14ms [prepare input and forward]:11.21ms [forward]:4.18ms
+5719:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.71ms [prepare input and forward]:10.15ms [forward]:4.42ms
+5723:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.31ms [forward]:4.25ms
+5725:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.12ms [prepare input and forward]:10.33ms [forward]:4.24ms
+5729:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.58ms [prepare input and forward]:10.85ms [forward]:4.32ms
+5733:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.32ms [prepare input and forward]:9.79ms [forward]:4.28ms
+5737:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:15.06ms [prepare input and forward]:9.89ms [forward]:4.32ms
+5739:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.48ms [forward]:4.27ms
+5743:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.60ms [prepare input and forward]:10.71ms [forward]:4.61ms
+5747:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.21ms [prepare input and forward]:10.10ms [forward]:4.52ms
+5751:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:15.03ms [prepare input and forward]:10.00ms [forward]:4.42ms
+
+```
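For reference, here is a minimal usage sketch of the two APIs documented above. The stage tags and the `preprocess`/`model_forward`/`postprocess` helpers are illustrative placeholders, not part of vllm-ascend; only the environment variable and the two `ProfileExecuteDuration` calls come from the documentation and the test added in this commit.

```python
import os

from vllm_ascend.utils import ProfileExecuteDuration

# Enable the feature before the worker runs (can also be exported in the shell).
os.environ["VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE"] = "1"


def run_one_step(model_input):
    # Non-blocking: each context manager records NPU events around its stage.
    with ProfileExecuteDuration().capture_async("prepare input"):
        inputs = preprocess(model_input)        # placeholder helper
    with ProfileExecuteDuration().capture_async("forward"):
        outputs = model_forward(inputs)         # placeholder helper
    with ProfileExecuteDuration().capture_async("post process"):
        results = postprocess(outputs)          # placeholder helper

    # Blocking: resolves the recorded events and returns {tag: duration_in_ms}.
    durations = ProfileExecuteDuration().pop_captured_sync()
    print(" ".join(f"[{tag}]:{ms:.2f}ms" for tag, ms in durations.items()))
    return results
```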

tests/singlecard/test_ascend_config.py

Lines changed: 1 addition & 0 deletions
@@ -114,5 +114,6 @@ def test_ascend_config_load_error():
             },
         }
         with VllmRunner("facebook/opt-125m",
+                        enforce_eager=False,
                         additional_config=input_additional_config_fake_2):
             pass
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import time
+from unittest.mock import patch
+
+import torch
+import vllm  # noqa: F401
+
+from vllm_ascend.utils import ProfileExecuteDuration
+
+
+@patch.dict(os.environ, {"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": "1"})
+def test_execue_duration_enabled_discrepancy():
+    a = torch.randn(10000, 10000).npu()
+    b = torch.randn(10000, 10000).npu()
+
+    # warmup
+    torch.matmul(a, b)
+    torch.npu.synchronize()
+
+    cpu_start = time.perf_counter()
+    with ProfileExecuteDuration().capture_async("forward"):
+        torch.matmul(a, b)
+        torch.npu.synchronize()
+    cpu_duration = (time.perf_counter() - cpu_start) * 1000
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert npu_durations and 'forward' in npu_durations
+    assert not ProfileExecuteDuration._observations
+
+    # Assert discrepancy between CPU and NPU duration is within 50% roughly
+    diff = abs(cpu_duration - npu_durations['forward']) / max(
+        cpu_duration, npu_durations['forward'])
+    assert diff <= 0.5, (
+        f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+
+
+def test_execue_duration_disabled():
+    a = torch.randn(100, 100).npu()
+    b = torch.randn(100, 100).npu()
+
+    with ProfileExecuteDuration().capture_async("forward"):
+        torch.matmul(a, b)
+        torch.npu.synchronize()
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert not npu_durations

vllm_ascend/ascend_config.py

Lines changed: 3 additions & 1 deletion
@@ -53,6 +53,8 @@ def __init__(self, torchair_graph_config):
             "graph_batch_sizes", [])
         self.graph_batch_sizes_init = torchair_graph_config.get(
             "graph_batch_sizes_init", False)
+        self.enable_multistream_shared_expert = torchair_graph_config.get(
+            "enable_multistream_shared_expert", False)

         if not isinstance(self.graph_batch_sizes, list):
             raise TypeError("graph_batch_sizes must be list[int]")
@@ -105,7 +107,7 @@ def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()

     # Both for V0 and V1 Engine, torchair_graph cannot be enabled with eager mode.
-    if ascend_config.torchair_graph_config.enabled and not enforce_eager:
+    if ascend_config.torchair_graph_config.enabled and enforce_eager:
         raise RuntimeError(
             "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
         )
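As a usage note, the new option is read from `torchair_graph_config`, so it would be switched on through `additional_config` in the same way as the other torchair options. The sketch below is a hypothetical configuration example: the model name is a placeholder, and only the `enabled` and `enable_multistream_shared_expert` keys are confirmed by this diff.

```python
from vllm import LLM

# Hypothetical example: enable torchair graph mode together with the new
# multistream shared-expert path. Graph mode requires enforce_eager=False,
# matching the check in check_ascend_config above.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model name
    enforce_eager=False,
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_shared_expert": True,
        },
    },
)
```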

vllm_ascend/envs.py

Lines changed: 45 additions & 8 deletions
@@ -27,47 +27,84 @@
 # begin-env-vars-definition

 env_variables: Dict[str, Callable[[], Any]] = {
-    # max compile thread num
+    # max compile thread number for package building. Usually, it is set to
+    # the number of CPU cores. If not set, the default value is None, which
+    # means all CPU cores will be used.
     "MAX_JOBS":
     lambda: os.getenv("MAX_JOBS", None),
+    # The build type of the package. It can be one of the following values:
+    # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
     "CMAKE_BUILD_TYPE":
     lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    # Whether to compile custom kernels. If not set, the default value is True.
+    # If set to False, the custom kernels will not be compiled. Please note that
+    # the sleep mode feature will be disabled as well if custom kernels are not
+    # compiled.
     "COMPILE_CUSTOM_KERNELS":
     lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
+    # The CXX compiler used for compiling the package. If not set, the default
+    # value is None, which means the system default CXX compiler will be used.
+    "CXX_COMPILER":
+    lambda: os.getenv("CXX_COMPILER", None),
+    # The C compiler used for compiling the package. If not set, the default
+    # value is None, which means the system default C compiler will be used.
+    "C_COMPILER":
+    lambda: os.getenv("C_COMPILER", None),
+    # Whether to enable MC2 for DeepSeek. If not set, the default value is False.
+    # MC2 is a fusion operator provided by Ascend to speed up computing and communication.
+    # Find more detail here: https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
+    # Whether to enable the topk optimization. It's disabled by default because
+    # the support is experimental. We'll enable it by default in the future.
     "VLLM_ASCEND_ENABLE_TOPK_OPTIMZE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_OPTIMZE", '0'))),
+    # Whether to use LCCL communication. If not set, the default value is False.
     "USING_LCCL_COM":
     lambda: bool(int(os.getenv("USING_LCCL_COM", '0'))),
+    # The version of the Ascend chip. If not set, the default value is
+    # ASCEND910B1. It's used for package building. Please make sure that the
+    # version is correct.
     "SOC_VERSION":
     lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
     # If set, vllm-ascend will print verbose logs during compilation
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
+    # The home path for the CANN toolkit. If not set, the default value is
+    # /usr/local/Ascend/ascend-toolkit/latest
     "ASCEND_HOME_PATH":
     lambda: os.getenv("ASCEND_HOME_PATH", None),
-    "LD_LIBRARY_PATH":
-    lambda: os.getenv("LD_LIBRARY_PATH", None),
-    # Used for disaggregated prefilling
+    # The path for the HCCN tool, which is called in the disaggregated
+    # prefilling case.
     "HCCN_PATH":
     lambda: os.getenv("HCCN_PATH", "/usr/local/Ascend/driver/tools/hccn_tool"),
+    # The path for the HCCL library, used by the pyhccl communicator backend.
+    # If not set, the default value is libhccl.so.
     "HCCL_SO_PATH":
     lambda: os.environ.get("HCCL_SO_PATH", None),
+    # The prefill device id for the disaggregated prefilling case.
     "PROMPT_DEVICE_ID":
     lambda: os.getenv("PROMPT_DEVICE_ID", None),
+    # The decode device id for the disaggregated prefilling case.
     "DECODE_DEVICE_ID":
     lambda: os.getenv("DECODE_DEVICE_ID", None),
+    # The port number for llmdatadist communication. If not set, the default
+    # value is 26000.
     "LLMDATADIST_COMM_PORT":
     lambda: os.getenv("LLMDATADIST_COMM_PORT", "26000"),
+    # The wait time for llmdatadist sync cache. If not set, the default value is
+    # 5000ms.
     "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
     lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
-    "CXX_COMPILER":
-    lambda: os.getenv("CXX_COMPILER", None),
-    "C_COMPILER":
-    lambda: os.getenv("C_COMPILER", None),
+    # The version of vllm that is installed. This value is used by developers
+    # who install vllm from source locally, in which case the reported vllm
+    # version is usually different. For example, if vllm "0.9.0" is installed
+    # from source, its version may be reported as "0.9.1"; developers then need
+    # to set this value to "0.9.0" to make sure the correct version is matched.
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
+    # Whether to enable tracing of recompiles from pytorch.
     "VLLM_ASCEND_TRACE_RECOMPILES":
     lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
     "VLLM_ASCEND_ENABLE_DBO":

vllm_ascend/models/deepseek_v2.py

Lines changed: 19 additions & 2 deletions
@@ -216,6 +216,8 @@ def __init__(

         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_multistream_shared_expert = \
+            ascend_config.torchair_graph_config.enable_multistream_shared_expert

     def forward(
         self,
@@ -238,6 +240,8 @@ def forward(

         num_tokens, hidden_size = hidden_states.shape

+        multistream = self.enable_multistream_shared_expert and not is_prefill
+
         old_hidden_states = hidden_states.clone()

         if self.tp_size > 1:
@@ -259,13 +263,25 @@ def forward(
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)

+        kwargs = {}
+        if multistream:
+            kwargs.update({
+                "shared_experts": self.shared_experts,
+                "shared_hidden_states": old_hidden_states
+            })
+
         hidden_states = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
             is_prefill=is_prefill,
             top_k=CustomDeepseekV2MoE.top_k,
             enable_force_load_balance=enable_force_load_balance,
-        ) * self.routed_scaling_factor
+            **kwargs)
+
+        if multistream:
+            hidden_states, shared_output = hidden_states
+
+        hidden_states = hidden_states * self.routed_scaling_factor

         if self.tp_size > 1:
             if self.torchair_graph_enabled:
@@ -288,7 +304,8 @@ def forward(
             hidden_states = hidden_states[:-num_padding_tokens]

         if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(old_hidden_states)
+            if not multistream:
+                shared_output = self.shared_experts(old_hidden_states)

         if shared_output is not None:
             hidden_states = hidden_states + shared_output
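Conceptually, passing `shared_experts` and `shared_hidden_states` into `self.experts` lets the MoE layer run the shared expert on a secondary stream while the routed experts run on the default stream. The snippet below is only an illustration of that overlap idea using the torch.npu stream API; it is not the implementation used by this commit, and the function and argument names are placeholders.

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu backend

shared_stream = torch.npu.Stream()


def routed_and_shared(routed_fn, shared_experts, hidden_states, shared_hidden_states):
    # Launch the shared expert on a side stream so it can overlap with the
    # routed-expert computation issued on the default stream.
    shared_stream.wait_stream(torch.npu.current_stream())
    with torch.npu.stream(shared_stream):
        shared_output = shared_experts(shared_hidden_states)

    routed_output = routed_fn(hidden_states)

    # Make the default stream wait for the shared expert before the two
    # outputs are combined.
    torch.npu.current_stream().wait_stream(shared_stream)
    return routed_output, shared_output
```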

vllm_ascend/ops/fused_moe.py

Lines changed: 28 additions & 19 deletions
@@ -39,19 +39,18 @@
 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM


-def fused_experts_with_mc2(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    top_k: int,
-    expert_map: torch.Tensor = None,
-    moe_all_to_all_group_name: Optional[str] = None,
-) -> torch.Tensor:
+def fused_experts_with_mc2(hidden_states: torch.Tensor,
+                           w1: torch.Tensor,
+                           w2: torch.Tensor,
+                           topk_weights: torch.Tensor,
+                           topk_ids: torch.Tensor,
+                           top_k: int,
+                           expert_map: torch.Tensor = None,
+                           moe_all_to_all_group_name: Optional[str] = None,
+                           **kwargs) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
-    kwargs = {
+    kwargs_mc2 = {
         "x": hidden_states,
         "expert_ids": topk_ids,
         "expert_shard_type": 0,
@@ -81,9 +80,9 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage1_kwargs)
+    kwargs_mc2.update(stage1_kwargs)

-    output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
+    output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
@@ -119,7 +118,7 @@ def fused_experts_with_mc2(
     down_out_list = torch.cat(down_out_list, dim=0)

     # moeCombine
-    kwargs = {
+    kwargs_mc2 = {
         "expand_x": down_out_list,
         "expert_ids": topk_ids,
         "expand_idx": expand_idx,
@@ -141,9 +140,9 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage3_kwargs)
+    kwargs_mc2.update(stage3_kwargs)

-    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
+    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)

     return hidden_states

@@ -675,7 +674,8 @@ def apply(
                 topk_ids=topk_ids,
                 top_k=top_k,
                 expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                **kwargs)
         elif self.torchair_graph_enabled or get_ep_group().world_size == 1:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
@@ -772,6 +772,8 @@ def __init__(

         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_multistream_shared_expert = \
+            ascend_config.torchair_graph_config.enable_multistream_shared_expert

         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
@@ -818,7 +820,8 @@ def forward(self,
                 router_logits: torch.Tensor,
                 is_prefill: bool,
                 enable_force_load_balance: bool = False,
-                top_k=None):
+                top_k=None,
+                **kwargs):
         assert self.quant_method is not None

         if top_k:
@@ -862,7 +865,11 @@ def forward(self,
             scoring_func=self.scoring_func,
             e_score_correction_bias=self.e_score_correction_bias,
             is_prefill=is_prefill,
-            enable_force_load_balance=enable_force_load_balance)
+            enable_force_load_balance=enable_force_load_balance,
+            **kwargs)
+
+        if self.enable_multistream_shared_expert and not is_prefill:
+            hidden_states, shared_output = hidden_states

         if self.dp_size > 1:
             if VLLM_ENABLE_MC2 and not is_prefill:
@@ -886,6 +893,8 @@ def forward(self,
         if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)

+        if self.enable_multistream_shared_expert and not is_prefill:
+            return hidden_states, shared_output
         return hidden_states

 # ----------------------------------------- TBO-related --------------------------------------------
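For callers, the visible change is the return contract of `AscendFusedMoE.forward`: with `enable_multistream_shared_expert` on and outside prefill it returns a `(routed_hidden_states, shared_output)` tuple, otherwise a single tensor, which is why `CustomDeepseekV2MoE.forward` above unpacks it conditionally. A small caller-side sketch of handling both shapes (function and variable names are illustrative, not from this commit):

```python
def call_moe(experts, hidden_states, router_logits, is_prefill, multistream, **kwargs):
    out = experts(hidden_states=hidden_states,
                  router_logits=router_logits,
                  is_prefill=is_prefill,
                  **kwargs)

    shared_output = None
    if multistream:
        # Decode path with multistream shared expert: the layer also returns
        # the shared-expert output computed inside the fused call.
        out, shared_output = out

    return out, shared_output
```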
