@@ -13,7 +13,9 @@

 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-import vllm_ascend.envs as envs_ascend
+from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
+from vllm_ascend.multistream.context import get_multistream_comm_context
+from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla

 if TYPE_CHECKING:
@@ -118,6 +120,7 @@ class AscendMLAMetadata:

     with_prefill_across_dp: bool = False

+    query_lens: Optional[list[int]] = None
     # The dimension of the attention heads
     head_dim: Optional[int] = None
     attn_mask: torch.Tensor = None
@@ -136,6 +139,17 @@ def __post_init__(self):
         #     f"Only {supported_head_sizes} are supported for head_dim,",
         #     f"received {self.head_dim}.")

+    def split_metadata_for_multistream(
+        self,
+        ms_split_config: MSAttentionMetadataSplitConfig,
+    ) -> list["AscendMLAMetadata"]:
+        """Split metadata for multi-stream with AscendMLAMetadata"""
+        return model_input_split_v1_mla_attn(
+            ms_split_config=ms_split_config,
+            attn_metadata=self,
+            _metadata_cls=AscendMLAMetadata,
+        )
+

 M = TypeVar("M", bound=AscendMLAMetadata)

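For readers unfamiliar with the dual-batch-overlap (DBO) flow: `split_metadata_for_multistream` delegates to `model_input_split_v1_mla_attn`, which partitions one batch's attention metadata into micro-batches so that one micro-batch's compute can overlap the other's communication. A minimal, self-contained sketch of the idea only; `ToyMetadata` and `split_toy_metadata` are hypothetical and far simpler than the real splitter, which also has to slice slot mappings, masks, and decode/prefill bookkeeping:

```python
from dataclasses import dataclass


@dataclass
class ToyMetadata:  # hypothetical stand-in for AscendMLAMetadata
    query_lens: list[int]
    num_actual_tokens: int


def split_toy_metadata(meta: ToyMetadata, split_req: int) -> list[ToyMetadata]:
    """Split at the request boundary `split_req` into two micro-batches."""
    first, second = meta.query_lens[:split_req], meta.query_lens[split_req:]
    return [
        ToyMetadata(query_lens=first, num_actual_tokens=sum(first)),
        ToyMetadata(query_lens=second, num_actual_tokens=sum(second)),
    ]


halves = split_toy_metadata(ToyMetadata([4, 4, 8, 2], 18), split_req=2)
assert [m.num_actual_tokens for m in halves] == [8, 10]
```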
@@ -387,6 +401,7 @@ def build(

         return self.metadata_cls(  # type: ignore
             num_actual_tokens=num_actual_tokens,
+            query_lens=query_lens.tolist(),
             slot_mapping=slot_mapping,
             head_dim=self.runner.model_config.get_head_size(),
             num_decodes=self._num_decodes,
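Note that `query_lens` is materialized as a plain Python list via `.tolist()`, which lets the splitter search for a micro-batch boundary on the host without a device sync. A hedged sketch of that kind of boundary search; `find_split_index` is hypothetical, not part of this patch:

```python
def find_split_index(query_lens: list[int], target_tokens: int) -> int:
    """Index of the first request boundary at/after `target_tokens` tokens."""
    total = 0
    for i, qlen in enumerate(query_lens):
        total += qlen
        if total >= target_tokens:
            return i + 1
    return len(query_lens)


# e.g. aiming for a roughly even token split of [4, 4, 8, 2] (18 tokens):
assert find_split_index([4, 4, 8, 2], 9) == 3  # first 3 requests cover >= 9
```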
@@ -444,9 +459,9 @@ def __init__(
         self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None)
         self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None)

-        self.enable_kv_nz = envs_ascend.VLLM_ENABLE_KV_NZ
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz

     def _v_up_proj_and_o_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
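This hunk moves `enable_kv_nz` from the `VLLM_ENABLE_KV_NZ` environment variable into `ascend_config.torchair_graph_config`, next to the existing `enabled` flag. A small sketch of the before/after access pattern; the dataclasses below are illustrative stand-ins, not vllm-ascend's actual `AscendConfig`:

```python
import os
from dataclasses import dataclass, field


@dataclass
class TorchairGraphConfig:  # illustrative only; field names mirror the diff
    enabled: bool = False
    enable_kv_nz: bool = False


@dataclass
class AscendConfig:  # illustrative only
    torchair_graph_config: TorchairGraphConfig = field(
        default_factory=TorchairGraphConfig)


# Before: the flag lived in a process-wide env var, parsed on its own.
enable_kv_nz_old = os.environ.get("VLLM_ENABLE_KV_NZ", "0") == "1"

# After: the flag sits beside the other torchair graph options, so it is
# parsed and validated in one place and discoverable alongside `enabled`.
config = AscendConfig()
enable_kv_nz_new = config.torchair_graph_config.enable_kv_nz
```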
@@ -587,7 +602,15 @@ def _forward_prefill(
         )
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
-        return self.o_proj(attn_output)[0]
+
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self.o_proj(attn_output)[0]
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self.o_proj(attn_output)[0]

     def exec_kv(
         self,
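The pattern above records an event on the current compute stream, then makes a dedicated comm stream wait on it before launching `o_proj`, so the projection and its tensor-parallel all-reduce can overlap the other micro-batch's compute. A minimal sketch of that handoff, using `torch.cuda` streams as a stand-in for `torch.npu` (the stream/event APIs mirror each other, and a CUDA device is assumed at runtime); `CommContext` is a hypothetical stand-in for whatever `get_multistream_comm_context()` returns:

```python
from dataclasses import dataclass, field
from typing import Optional

import torch


@dataclass
class CommContext:  # hypothetical stand-in for the multistream comm context
    comm_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream)
    before_comm_event: torch.cuda.Event = field(default_factory=torch.cuda.Event)
    after_comm_event: torch.cuda.Event = field(default_factory=torch.cuda.Event)


def project_with_overlap(o_proj, attn_output, ctx: Optional[CommContext]):
    if ctx is None:
        return o_proj(attn_output)[0]
    # Mark "attention output is ready" on the current compute stream...
    ctx.before_comm_event.record()
    with torch.cuda.stream(ctx.comm_stream):
        # ...and block the comm stream on that point before launching o_proj,
        # whose all-reduce then overlaps the other micro-batch's compute.
        ctx.before_comm_event.wait()
        return o_proj(attn_output)[0]
```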
@@ -731,7 +754,14 @@ def _forward_decode(
             context_lens=attn_metadata.decode.seq_lens,  # type:ignore
             mla_vheadsize=self.kv_lora_rank,
             out=attn_output)
-        return self._v_up_proj_and_o_proj(attn_output)
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self._v_up_proj_and_o_proj(attn_output)
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self._v_up_proj_and_o_proj(attn_output)

     def forward(
         self,
@@ -863,16 +893,38 @@ def forward(
                 key_cache=kv_cache,
                 slot_indices=attn_metadata.slot_mapping.flatten())
         if has_prefill:
-            output[num_decode_tokens:] = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata)
+            # FIX: the aicore copy should also be placed on the comm stream
+            # in dbo, otherwise it may affect accuracy.
+            # TODO: overlap in a more elegant way
+            output_prefill = self._forward_prefill(prefill_q,
+                                                   prefill_k_c_normed,
+                                                   prefill_k_pe, kv_cache,
+                                                   attn_metadata)
+            current_ms_metadata = get_multistream_comm_context()
+            if current_ms_metadata is not None:
+                with torch.npu.stream(current_ms_metadata.comm_stream):
+                    output[num_decode_tokens:] = output_prefill
+                    current_ms_metadata.after_comm_event.record()
+            else:
+                output[num_decode_tokens:] = output_prefill
+
         if has_decode:
             if self.running_in_graph:
                 return self._forward_decode(decode_ql_nope, decode_q_pe,
                                             decode_k_nope, decode_k_pe,
                                             kv_cache, attn_metadata)
             else:
-                output[:num_decode_tokens] = self._forward_decode(
-                    decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                    kv_cache, attn_metadata)
+                output_decode = self._forward_decode(decode_ql_nope,
+                                                     decode_q_pe,
+                                                     decode_k_nope,
+                                                     decode_k_pe, kv_cache,
+                                                     attn_metadata)
+                current_ms_metadata = get_multistream_comm_context()
+                if current_ms_metadata is not None:
+                    with torch.npu.stream(current_ms_metadata.comm_stream):
+                        output[:num_decode_tokens] = output_decode
+                        current_ms_metadata.after_comm_event.record()
+                else:
+                    output[:num_decode_tokens] = output_decode
+
         return output_padded
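`forward()` applies the complementary half of the pattern: the attention result is computed first, then the copy into the shared `output` buffer is issued on the comm stream and `after_comm_event` is recorded so the other micro-batch can synchronize before consuming it. Continuing the same hypothetical sketch (same `CommContext` as above, cuda standing in for npu):

```python
def scatter_result(output: torch.Tensor, dst: slice, result: torch.Tensor,
                   ctx: Optional[CommContext]) -> None:
    if ctx is None:
        output[dst] = result
        return
    with torch.cuda.stream(ctx.comm_stream):
        output[dst] = result            # device copy issued on the comm stream
        ctx.after_comm_event.record()   # consumers call after_comm_event.wait()
```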