Commit b7966ea

[bugfix] add with_prefill cpu allreduce to handle D-node recomputation situations
Signed-off-by: liziyu <liziyu16@huawei.com>
1 parent bd2f365

File tree

2 files changed: +14 -19 lines


vllm_ascend/models/deepseek_v2.py

Lines changed: 1 addition & 10 deletions
@@ -34,8 +34,7 @@
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import (CacheConfig, ModelConfig, VllmConfig)
 from vllm.distributed import (get_dp_group, get_pp_group,
                               get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -335,10 +334,6 @@ def __init__(
 
         self.tp_group = get_tp_group().device_group
         self.tp_rank = get_tp_group().rank_in_group
-        self.kv_consumer = None
-        transfer_config = get_current_vllm_config().kv_transfer_config
-        if transfer_config is not None:
-            self.kv_consumer = transfer_config.kv_role == "kv_consumer"
 
     def forward(
         self,
@@ -353,10 +348,6 @@ def forward(
         enable_force_load_balance = forward_context.in_profile_run
 
         is_prefill = forward_context.with_prefill
-        # If this node is kv_consumer, we force the moe always runs in decode path to make sure
-        # the behaviour aligned between dummy_run and normal model_execute.
-        if self.kv_consumer:
-            is_prefill = False
 
         # router_logits: (num_tokens, n_experts)
         if self.enable_multistream_moe:
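
Note on the removal above: before this commit, each MoE layer read the KV-transfer role from the global vLLM config and, on a kv_consumer node, forced the decode path locally. A minimal sketch of that now-removed check (it reuses the get_current_vllm_config API the deleted lines relied on; the helper name is illustrative only):

from vllm.config import get_current_vllm_config

def _is_kv_consumer_node() -> bool:
    # kv_transfer_config is None unless P/D disaggregation is configured.
    transfer_config = get_current_vllm_config().kv_transfer_config
    return transfer_config is not None and transfer_config.kv_role == "kv_consumer"

With this commit the layer no longer overrides is_prefill on its own; the prefill/decode decision is made once per step in the model runner (see the next file).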

vllm_ascend/worker/model_runner_v1.py

Lines changed: 13 additions & 9 deletions
@@ -633,12 +633,19 @@ def _get_forward_metadata_across_dp(
         if self.is_kv_consumer and self.torchair_graph_enabled and len(
                 self.torchair_graph_batch_sizes
         ) == 1 and not self.in_profile_run:
-            max_num_decode_tokens = self.torchair_graph_batch_sizes[0]
-            num_tokens_across_dp = torch.tensor([max_num_decode_tokens] *
-                                                self.dp_size,
-                                                device="cpu",
-                                                dtype=torch.int32)
-            return max_num_decode_tokens, num_tokens_across_dp, False, enable_dbo
+            with_prefill_tensor = torch.tensor([with_prefill],
+                                               device="cpu",
+                                               dtype=torch.bool)
+            dist.all_reduce(with_prefill_tensor,
+                            group=get_dp_group().cpu_group,
+                            op=dist.ReduceOp.MAX)
+            if not with_prefill_tensor.item():
+                max_num_decode_tokens = self.torchair_graph_batch_sizes[0]
+                num_tokens_across_dp = torch.tensor([max_num_decode_tokens] *
+                                                    self.dp_size,
+                                                    device="cpu",
+                                                    dtype=torch.int32)
+                return max_num_decode_tokens, num_tokens_across_dp, False, enable_dbo
 
         num_tokens_across_dp = [0] * self.dp_size * 2
         num_tokens_across_dp[self.dp_rank] = maybe_padded_num_tokens
@@ -1641,9 +1648,6 @@ def _dummy_run(
         maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
             num_tokens)
 
-        # For kv producer, with prefill always true
-        if self.is_kv_producer:
-            with_prefill = True
         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
          enable_dbo) = self._get_forward_metadata_across_dp(
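
The new branch only takes the fixed torchair decode batch size after confirming, via a CPU all-reduce over the data-parallel group, that no rank has prefill work this step; this is what handles the D-node recomputation case from the commit title. A minimal standalone sketch of that agreement pattern, assuming a CPU-backed process group (dp_cpu_group stands in for get_dp_group().cpu_group):

import torch
import torch.distributed as dist

def any_rank_has_prefill(with_prefill: bool, dp_cpu_group) -> bool:
    # MAX over a bool tensor acts as a logical OR across ranks.
    flag = torch.tensor([with_prefill], device="cpu", dtype=torch.bool)
    dist.all_reduce(flag, group=dp_cpu_group, op=dist.ReduceOp.MAX)
    return bool(flag.item())

If any DP rank reports prefill, the runner falls through to the generic padding path below the early return instead of assuming the graph-decode token count, which is also why the patch can drop the unconditional with_prefill = True override for KV producers in _dummy_run.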
