
Commit 8acb0f5

move variable to additional config
Signed-off-by: chenwaner <861645847@qq.com>
1 parent 8740191 commit 8acb0f5

4 files changed: +142 additions, -62 deletions

docs/source/user_guide/additional_config.md

Lines changed: 23 additions & 16 deletions
````diff
@@ -24,28 +24,32 @@ LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})
 
 The following table lists the additional configuration options available in vLLM Ascend:
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
-| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `1` | Expert tensor parallel size the model to use. |
+| Name | Type | Default | Description |
+| ----------------------------- | ---- | ------- | ----------------------------------------------------------------------------------------- |
+| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
+| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
+| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
+| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. |
 
 The details of each config option are as follows:
 
 **torchair_graph_config**
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `enabled` | bool | `False` | Whether to enable torchair graph mode |
-| `use_cached_graph` | bool | `False` | Whether to use cached graph |
-| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
-| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| Name | Type | Default | Description |
+| ---------------------------------- | --------- | ------- | ----------------------------------------------------------------- |
+| `enabled` | bool | `False` | Whether to enable torchair graph mode |
+| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
+| `use_cached_graph` | bool | `False` | Whether to use cached graph |
+| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
+| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| `enable_multistream_shared_expert` | bool | `False` | Whether to enable multistream shared expert |
+| `enable_kv_nz` | bool | `False` | Whether to enable kvcache NZ layout |
 
 **ascend_scheduler_config**
 
-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
+| Name | Type | Default | Description |
+| --------- | ---- | ------- | ------------------------------------------------ |
+| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine |
 
 ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well.
 
@@ -59,12 +63,15 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "use_cached_graph": true,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": true
+        "graph_batch_sizes_init": false,
+        "enable_multistream_shared_expert": false,
+        "enable_kv_nz": false
     },
     "ascend_scheduler_config": {
         "enabled": true,
         "chunked_prefill_enabled": true,
     },
-    "expert_tensor_parallel_size": 1
+    "expert_tensor_parallel_size": 1,
+    "refresh": false,
 }
 ```
````
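For reference, the options documented above are passed through the `LLM(model="Qwen/Qwen3-8B", additional_config=...)` pattern already shown in this doc. Below is a minimal sketch of that usage; the model name and the chosen values are illustrative placeholders, not part of this change.

```python
from vllm import LLM

# Illustrative only: pass the options documented in additional_config.md.
# Option names and defaults follow the tables above; the model and the
# concrete values here are placeholders.
llm = LLM(
    model="Qwen/Qwen3-8B",
    additional_config={
        "torchair_graph_config": {
            "enabled": False,        # torchair graph mode, off by default
            "enable_kv_nz": False,   # kvcache NZ layout (replaces VLLM_ENABLE_KV_NZ)
        },
        "ascend_scheduler_config": {
            "enabled": False,
        },
        "expert_tensor_parallel_size": 0,
        "refresh": False,            # set True to rebuild the global ascend config (rlhf case)
    },
)
```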

vllm_ascend/ascend_config.py

Lines changed: 57 additions & 33 deletions
```diff
@@ -37,7 +37,7 @@ def __init__(self, vllm_config):
             ascend_scheduler_config)
 
         self.expert_tensor_parallel_size = int(
-            additional_config.get("expert_tensor_parallel_size", 1))
+            additional_config.get("expert_tensor_parallel_size", 0))
 
 
 class TorchairGraphConfig:
@@ -55,6 +55,10 @@ def __init__(self, torchair_graph_config):
             "graph_batch_sizes_init", False)
         self.enable_multistream_shared_expert = torchair_graph_config.get(
             "enable_multistream_shared_expert", False)
+        self.enable_view_optimize = torchair_graph_config.get(
+            "enable_view_optimize", True)
+        self.enable_kv_nz = torchair_graph_config.get(
+            "enable_kv_nz", False)
 
         if not isinstance(self.graph_batch_sizes, list):
             raise TypeError("graph_batch_sizes must be list[int]")
@@ -82,8 +86,11 @@ def __init__(self, ascend_scheduler_config: dict):
 
 
 def init_ascend_config(vllm_config):
+    additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+    refresh = additional_config.get("refresh",
+                                    False) if additional_config else False
     global _ASCEND_CONFIG
-    if _ASCEND_CONFIG is not None:
+    if _ASCEND_CONFIG is not None and not refresh:
         return _ASCEND_CONFIG
     _ASCEND_CONFIG = AscendConfig(vllm_config)
     return _ASCEND_CONFIG
@@ -106,35 +113,52 @@ def get_ascend_config():
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
 
-    # Both for V0 and V1 Engine, torchair_graph cannot be enabled with eager mode.
-    if ascend_config.torchair_graph_config.enabled and enforce_eager:
-        raise RuntimeError(
-            "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-        )
-
-    # torchair_graph only work with deepseek model and mla enabled.
-    if ascend_config.torchair_graph_config.enabled:
-        if envs.VLLM_MLA_DISABLE:
-            logger.warning(
-                "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
-                "it has been disabled automatically.")
-            ascend_config.ascend_scheduler_config.enabled = False
-        if vllm_config.model_config:
-            model_type = vllm_config.model_config.hf_config.model_type
-            if "deepseek" not in model_type:
-                raise NotImplementedError(
-                    "Torchair graph mode only works with deepseek model.")
-
-    # for V1 Engine, aclgraph doesn't work with deepseek model and only qwen model is well tested.
-    if envs.VLLM_USE_V1 and vllm_config.model_config is not None and not enforce_eager:
-        model_type = vllm_config.model_config.hf_config.model_type
-        if "deepseek" in model_type:
+    # for v0 engine
+    if not envs.VLLM_USE_V1:
+        if ascend_config.torchair_graph_config.enabled:
+            raise NotImplementedError(
+                "Torchair graph mode is only supported for V1 Engine.")
+        if ascend_config.ascend_scheduler_config.enabled:
             raise NotImplementedError(
-                "ACL Graph does not support deepseek. Please "
-                "try torchair graph mode to serve deepseek models on vllm-ascend."
-                " Or set `enforce_eager=True` to use eager mode.")
-        if "qwen" not in model_type:
-            logger.warning(
-                "ACL Graph is currently experimental. Please "
-                "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                " if you encourage any Error")
+                "Ascend scheduler is only supported for V1 Engine.")
+    # for v1 engine
+    else:
+        # for eager mode
+        if enforce_eager:
+            # torchair_graph cannot be enabled with eager mode.
+            if ascend_config.torchair_graph_config.enabled:
+                raise RuntimeError(
+                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+                )
+        # for graph mode
+        else:
+            # torchair_graph case
+            if ascend_config.torchair_graph_config.enabled:
+                # torchair_graph is not supported for V1 without mla currently.
+                if envs.VLLM_MLA_DISABLE:
+                    logger.warning(
+                        "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
+                        "it has been disabled automatically.")
+                    ascend_config.torchair_graph_config.enabled = False
+                # torchair_graph is supported for deepseek model only currently.
+                if vllm_config.model_config:
+                    model_type = vllm_config.model_config.hf_config.model_type
+                    if "deepseek" not in model_type:
+                        raise NotImplementedError(
+                            "Torchair graph mode only works with deepseek model."
+                        )
+            # aclgraph case
+            else:
+                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+                if vllm_config.model_config:
+                    model_type = vllm_config.model_config.hf_config.model_type
+                    if "deepseek" in model_type:
+                        raise NotImplementedError(
+                            "ACL Graph does not support deepseek. Please "
+                            "try torchair graph mode to serve deepseek models on vllm-ascend."
+                            " Or set `enforce_eager=True` to use eager mode.")
+                    if "qwen" not in model_type:
+                        logger.warning(
+                            "ACL Graph is currently experimental. Please "
+                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                            " if you encourage any Error")
```

vllm_ascend/attention/mla_v1.py

Lines changed: 62 additions & 10 deletions
```diff
@@ -13,7 +13,9 @@
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-import vllm_ascend.envs as envs_ascend
+from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
+from vllm_ascend.multistream.context import get_multistream_comm_context
+from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 
 if TYPE_CHECKING:
@@ -118,6 +120,7 @@ class AscendMLAMetadata:
 
     with_prefill_across_dp: bool = False
 
+    query_lens: Optional[list[int]] = None
     # The dimension of the attention heads
    head_dim: Optional[int] = None
     attn_mask: torch.Tensor = None
@@ -136,6 +139,17 @@ def __post_init__(self):
         # f"Only {supported_head_sizes} are supported for head_dim,",
         # f"received {self.head_dim}.")
 
+    def split_metadata_for_multistream(
+        self,
+        ms_split_config: MSAttentionMetadataSplitConfig,
+    ) -> list["AscendMLAMetadata"]:
+        """Split metadata for multi-stream with AscendMLAMetadata"""
+        return model_input_split_v1_mla_attn(
+            ms_split_config=ms_split_config,
+            attn_metadata=self,
+            _metadata_cls=AscendMLAMetadata,
+        )
+
 
 M = TypeVar("M", bound=AscendMLAMetadata)
 
@@ -387,6 +401,7 @@ def build(
 
         return self.metadata_cls(  # type: ignore
             num_actual_tokens=num_actual_tokens,
+            query_lens=query_lens.tolist(),
             slot_mapping=slot_mapping,
             head_dim=self.runner.model_config.get_head_size(),
             num_decodes=self._num_decodes,
@@ -444,9 +459,9 @@ def __init__(
         self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None)
         self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None)
 
-        self.enable_kv_nz = envs_ascend.VLLM_ENABLE_KV_NZ
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
 
     def _v_up_proj_and_o_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
@@ -587,7 +602,15 @@ def _forward_prefill(
         )
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
-        return self.o_proj(attn_output)[0]
+
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self.o_proj(attn_output)[0]
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self.o_proj(attn_output)[0]
 
     def exec_kv(
         self,
@@ -731,7 +754,14 @@ def _forward_decode(
                 context_lens=attn_metadata.decode.seq_lens,  # type:ignore
                 mla_vheadsize=self.kv_lora_rank,
                 out=attn_output)
-        return self._v_up_proj_and_o_proj(attn_output)
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self._v_up_proj_and_o_proj(attn_output)
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self._v_up_proj_and_o_proj(attn_output)
 
     def forward(
         self,
@@ -863,16 +893,38 @@ def forward(
                 key_cache=kv_cache,
                 slot_indices=attn_metadata.slot_mapping.flatten())
         if has_prefill:
-            output[num_decode_tokens:] = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata)
+            # FIX: aicore move should be also placed on the comm stream in dbo,
+            # otherwise it may affect the accuracy
+            # TODO: use an elegant way to overlap
+            output_prefill = self._forward_prefill(prefill_q,
+                                                   prefill_k_c_normed,
+                                                   prefill_k_pe, kv_cache,
+                                                   attn_metadata)
+            current_ms_metadata = get_multistream_comm_context()
+            if current_ms_metadata is not None:
+                with torch.npu.stream(current_ms_metadata.comm_stream):
+                    output[num_decode_tokens:] = output_prefill
+                    current_ms_metadata.after_comm_event.record()
+            else:
+                output[num_decode_tokens:] = output_prefill
+
         if has_decode:
             if self.running_in_graph:
                 return self._forward_decode(decode_ql_nope, decode_q_pe,
                                             decode_k_nope, decode_k_pe,
                                             kv_cache, attn_metadata)
             else:
-                output[:num_decode_tokens] = self._forward_decode(
-                    decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                    kv_cache, attn_metadata)
+                output_decode = self._forward_decode(decode_ql_nope,
+                                                     decode_q_pe,
+                                                     decode_k_nope,
+                                                     decode_k_pe, kv_cache,
+                                                     attn_metadata)
+                current_ms_metadata = get_multistream_comm_context()
+                if current_ms_metadata is not None:
+                    with torch.npu.stream(current_ms_metadata.comm_stream):
+                        output[:num_decode_tokens] = output_decode
+                        current_ms_metadata.after_comm_event.record()
+                else:
+                    output[:num_decode_tokens] = output_decode
+
         return output_padded
```
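The recurring pattern in this file is: compute the attention result on the main stream, and only when a multistream communication context is active, hand the output projection (or the copy into `output`) to a dedicated comm stream, ordered by `before_comm_event` / `after_comm_event`. Below is a portable sketch of that pattern; it uses `torch.cuda` streams as a stand-in for `torch.npu`, and the `CommContext` container is an assumption that mirrors the fields used in the diff, not the vllm-ascend API.

```python
# Sketch only: optional comm-stream handoff, assuming a CUDA device as a
# stand-in for the NPU. CommContext and project_maybe_on_comm_stream are
# hypothetical names for illustration.
from dataclasses import dataclass
from typing import Callable, Optional

import torch


@dataclass
class CommContext:
    comm_stream: torch.cuda.Stream
    before_comm_event: torch.cuda.Event
    after_comm_event: torch.cuda.Event


def project_maybe_on_comm_stream(
    project: Callable[[torch.Tensor], torch.Tensor],
    attn_output: torch.Tensor,
    ctx: Optional[CommContext],
) -> torch.Tensor:
    if ctx is None:
        # Single-stream path: run the projection on the current stream.
        return project(attn_output)
    # Dual-stream path: mark the point where the compute stream produced
    # attn_output, then make the comm stream wait for it before projecting.
    ctx.before_comm_event.record()
    with torch.cuda.stream(ctx.comm_stream):
        ctx.before_comm_event.wait()
        out = project(attn_output)
        ctx.after_comm_event.record()
    return out
```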

vllm_ascend/envs.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -55,9 +55,6 @@
     # Find more detail here: https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
-    # Whether to enable the kvcache nz optimization, the default value is False.
-    "VLLM_ENABLE_KV_NZ":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_KV_NZ", '0'))),
     # Whether to enable the topk optimization. It's disabled by default for experimental support
     # We'll make it enabled by default in the future.
     "VLLM_ASCEND_ENABLE_TOPK_OPTIMZE":
```
