Commit 92e6aa9
[bugfix] add with_prefill cpu allreduce to handle D-node recomputation (#2129)
### What this PR does / why we need it?

Add a with-prefill CPU AllReduce across data-parallel ranks to handle D-node (decode-node) recomputation situations.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

gsm8k
http://image.huawei.com/tiny-lts/v1/images/mdstorm/dcbc43b858db666f185d73868f7933fb_1242x502.png

livecodebench
http://image.huawei.com/tiny-lts/v1/images/mdstorm/78a2e9695c3d841870d02c840f032154_1242x502.png

vllm benchmark
http://image.huawei.com/tiny-lts/v1/images/mdstorm/a4d32f4f2d702cf89854b83ae4d58337_1242x502.png

performance
http://image.huawei.com/tiny-lts/v1/images/mdstorm/38e194a09c3c9ae902a3772f1dca6862_1609x1095.png

Signed-off-by: liziyu <liziyu16@huawei.com>
1 parent: 5c9e7a0
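The heart of the change is agreeing on `with_prefill` across all data-parallel ranks before any rank commits to the torchair decode-only path: a MAX all-reduce over a boolean on a CPU process group acts as a logical OR, so if any rank (for example a D-node recomputing a request) is prefilling, every rank sees it. Below is a minimal runnable sketch of that pattern, assuming a gloo-backed CPU group; the script name and helper function are illustrative, not part of the PR:

```python
# with_prefill_sync.py -- run with: torchrun --nproc_per_node=2 with_prefill_sync.py
# Sketch of the synchronization pattern this commit adds (not the
# vllm-ascend code itself): MAX over booleans == logical OR across ranks.
import torch
import torch.distributed as dist


def sync_with_prefill(with_prefill: bool, cpu_group) -> bool:
    """Return True iff any rank in `cpu_group` currently has prefill work."""
    flag = torch.tensor([with_prefill], device="cpu", dtype=torch.bool)
    dist.all_reduce(flag, group=cpu_group, op=dist.ReduceOp.MAX)
    return bool(flag.item())


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")  # CPU-capable backend
    rank = dist.get_rank()
    # Pretend only rank 0 has a prefill batch locally (e.g. a recompute).
    local = (rank == 0)
    print(f"rank {rank}: local={local} "
          f"global={sync_with_prefill(local, dist.group.WORLD)}")
    dist.destroy_process_group()
```

In the PR itself the reduce runs over `get_dp_group().cpu_group`, so every DP rank makes the same prefill-vs-decode decision for the step.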

File tree

2 files changed: +14 additions, −19 deletions


vllm_ascend/models/deepseek_v2.py (1 addition, 10 deletions)

```diff
@@ -34,8 +34,7 @@
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_dp_group, get_pp_group,
                               get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -335,10 +334,6 @@ def __init__(
 
         self.tp_group = get_tp_group().device_group
         self.tp_rank = get_tp_group().rank_in_group
-        self.kv_consumer = None
-        transfer_config = get_current_vllm_config().kv_transfer_config
-        if transfer_config is not None:
-            self.kv_consumer = transfer_config.kv_role == "kv_consumer"
 
     def forward(
         self,
@@ -353,10 +348,6 @@ def forward(
         enable_force_load_balance = forward_context.in_profile_run
 
         is_prefill = forward_context.with_prefill
-        # If this node is kv_consumer, we force the moe always runs in decode path to make sure
-        # the behaviour aligned between dummy_run and normal model_execute.
-        if self.kv_consumer:
-            is_prefill = False
 
         # router_logits: (num_tokens, n_experts)
         if self.enable_multistream_moe:
```

vllm_ascend/worker/model_runner_v1.py (13 additions, 9 deletions)

```diff
@@ -636,12 +636,19 @@ def _get_forward_metadata_across_dp(
         if self.is_kv_consumer and self.torchair_graph_enabled and len(
                 self.torchair_graph_batch_sizes
         ) == 1 and not self.in_profile_run:
-            max_num_decode_tokens = self.torchair_graph_batch_sizes[0]
-            num_tokens_across_dp = torch.tensor([max_num_decode_tokens] *
-                                                self.dp_size,
-                                                device="cpu",
-                                                dtype=torch.int32)
-            return max_num_decode_tokens, num_tokens_across_dp, False, enable_dbo
+            with_prefill_tensor = torch.tensor([with_prefill],
+                                               device="cpu",
+                                               dtype=torch.bool)
+            dist.all_reduce(with_prefill_tensor,
+                            group=get_dp_group().cpu_group,
+                            op=dist.ReduceOp.MAX)
+            if not with_prefill_tensor.item():
+                max_num_decode_tokens = self.torchair_graph_batch_sizes[0]
+                num_tokens_across_dp = torch.tensor([max_num_decode_tokens] *
+                                                    self.dp_size,
+                                                    device="cpu",
+                                                    dtype=torch.int32)
+                return max_num_decode_tokens, num_tokens_across_dp, False, enable_dbo
 
         num_tokens_across_dp = [0] * self.dp_size * 2
         num_tokens_across_dp[self.dp_rank] = maybe_padded_num_tokens
```
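For context, here is the gist of the new branch above condensed into a standalone function; this is a sketch under assumed names (`dp_size` and `graph_batch_size` stand in for `self.dp_size` and `self.torchair_graph_batch_sizes[0]`), not the runner method itself:

```python
# Condensed sketch of the gating this hunk adds: take the fixed-size
# decode fast path only when no data-parallel rank is prefilling.
import torch
import torch.distributed as dist


def decode_only_metadata(with_prefill: bool, cpu_group, dp_size: int,
                         graph_batch_size: int):
    flag = torch.tensor([with_prefill], device="cpu", dtype=torch.bool)
    dist.all_reduce(flag, group=cpu_group, op=dist.ReduceOp.MAX)
    if flag.item():
        # Some rank is prefilling (e.g. a D-node recompute): fall through.
        return None
    # All ranks are pure decode: pad every rank to the single captured
    # torchair graph size so the graph can be replayed as-is.
    num_tokens_across_dp = torch.full((dp_size,), graph_batch_size,
                                      device="cpu", dtype=torch.int32)
    return graph_batch_size, num_tokens_across_dp
```

Returning `None` here models falling through to the general DP padding path below, which is what keeps `_dummy_run` and real execution shapes consistent when a D-node has to recompute.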
```diff
@@ -1644,9 +1651,6 @@ def _dummy_run(
         maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
             num_tokens)
 
-        # For kv producer, with prefill always true
-        if self.is_kv_producer:
-            with_prefill = True
         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
          enable_dbo) = self._get_forward_metadata_across_dp(
```
