Commit 896f84f (1 parent: bc32acb)

[2/N][Refactor] torchair model runner refactor

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>

2 files changed: +54 additions, -14 deletions

vllm_ascend/torchair/torchair_model_runner.py (48 additions, 0 deletions)

```diff
@@ -17,8 +17,12 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
 #
 
+from typing import Optional
+
 import torch
+import torch.distributed as dist
 from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_dp_group
 
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
@@ -27,3 +31,47 @@ class NPUTorchairModelRunner(NPUModelRunner):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         super().__init__(vllm_config, device)
+
+    def _get_forward_metadata_across_dp(
+            self,
+            num_tokens: int,
+            with_prefill: bool,
+            enable_dbo: bool = False,
+    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
+        if with_prefill:
+            maybe_padded_num_tokens = num_tokens
+        else:
+            maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
+                num_tokens)
+        if self.dp_size == 1:
+            return maybe_padded_num_tokens, None, with_prefill, enable_dbo
+
+        num_tokens_across_dp = [0] * self.dp_size * 2
+        num_tokens_across_dp[self.dp_rank] = maybe_padded_num_tokens
+        num_tokens_across_dp[self.dp_size + self.dp_rank] = num_tokens
+        forward_metadata = torch.tensor(num_tokens_across_dp +
+                                        [with_prefill, not enable_dbo],
+                                        device="cpu",
+                                        dtype=torch.int32)
+        dist.all_reduce(forward_metadata, group=get_dp_group().cpu_group)
+        with_prefill = bool(forward_metadata[-2])
+
+        # NOTE: if with_prefill was false before the all_reduce and is true after it, we need to revert the padding.
+        if with_prefill:
+            num_tokens_across_dp = forward_metadata[self.dp_size:self.dp_size *
+                                                    2]
+            maybe_padded_num_tokens = num_tokens
+        else:
+            num_tokens_across_dp = forward_metadata[:self.dp_size]
+
+        # NOTE: in torchair graph mode, local_num_tokens must be padded to
+        # `max_tokens_across_dp`; in other situations it is not necessary.
+        if not with_prefill:
+            maybe_padded_num_tokens = torch.max(num_tokens_across_dp).item()
+            num_tokens_across_dp = torch.tensor([maybe_padded_num_tokens] *
+                                                self.dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+
+        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, not bool(
+            forward_metadata[-1])
```
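The packed layout is what lets a single SUM `all_reduce` carry all of the metadata: each rank writes its padded and raw token counts into its own slots of a zero vector, so the reduction doubles as an all-gather, while the two trailing flags combine as "any rank has prefill" and "all ranks enable dbo". A minimal single-process sketch of that arithmetic for `dp_size == 2` (the per-rank values are invented for illustration):

```python
import torch

dp_size = 2
# Per-rank (padded_num_tokens, raw_num_tokens, with_prefill, enable_dbo);
# these values are invented for illustration.
ranks = [(8, 5, False, True), (16, 12, True, False)]

# Each rank fills only its own slots of a zero vector, so summing the
# per-rank vectors (what the SUM all_reduce does) acts as an all-gather.
reduced = torch.zeros(2 * dp_size + 2, dtype=torch.int32)
for dp_rank, (padded, raw, with_prefill, enable_dbo) in enumerate(ranks):
    local = [0] * dp_size * 2
    local[dp_rank] = padded
    local[dp_size + dp_rank] = raw
    reduced += torch.tensor(local + [with_prefill, not enable_dbo],
                            dtype=torch.int32)

print(reduced[:dp_size])             # padded counts per rank: [ 8, 16]
print(reduced[dp_size:2 * dp_size])  # raw counts per rank:    [ 5, 12]
print(bool(reduced[-2]))             # True: at least one rank has prefill
print(not bool(reduced[-1]))         # False: dbo only if ALL ranks enable it
```

Storing `not enable_dbo` rather than `enable_dbo` is what turns the SUM into a logical AND: the summed slot stays zero only when every rank enabled dbo.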

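`select_torchair_padded_batch_size` itself is untouched by this commit; conceptually it rounds a decode batch up to the nearest batch size for which a torchair graph was compiled, so a pre-compiled graph can be reused. A hypothetical sketch of that kind of bucketing, with an assumed bucket list that does not come from this repository:

```python
import bisect

# Assumed example buckets; the real sizes come from the torchair graph
# configuration and are NOT taken from this commit.
GRAPH_BATCH_SIZES = [1, 2, 4, 8, 16, 32]


def select_padded_batch_size(num_tokens: int) -> int:
    """Round num_tokens up to the nearest pre-compiled graph batch size."""
    idx = bisect.bisect_left(GRAPH_BATCH_SIZES, num_tokens)
    if idx < len(GRAPH_BATCH_SIZES):
        return GRAPH_BATCH_SIZES[idx]
    return num_tokens  # past the largest bucket, keep the raw count


assert select_padded_batch_size(5) == 8
assert select_padded_batch_size(16) == 16
```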
vllm_ascend/worker/model_runner_v1.py (6 additions, 14 deletions)

```diff
@@ -571,11 +571,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
 
     def _get_forward_metadata_across_dp(
             self,
-            maybe_padded_num_tokens: int,
             num_tokens: int,
             with_prefill: bool,
             enable_dbo: bool = False,
     ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
+        maybe_padded_num_tokens = num_tokens
+        if self.torchair_graph_enabled and not with_prefill:
+            maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
+                num_tokens)
         if self.dp_size == 1:
             return maybe_padded_num_tokens, None, with_prefill, enable_dbo
 
@@ -1108,14 +1111,9 @@ def _process_reqs(
                          attn_state,
                          total_num_scheduled_tokens)
 
-        maybe_padded_num_tokens = total_num_scheduled_tokens
-        if self.torchair_graph_enabled and not with_prefill:
-            maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
-                total_num_scheduled_tokens)
         (padded_num_tokens_across_dp, num_tokens_across_dp, with_prefill,
          enable_dbo) = self._get_forward_metadata_across_dp(
-             maybe_padded_num_tokens, total_num_scheduled_tokens, with_prefill,
-             enable_dbo)
+             total_num_scheduled_tokens, with_prefill, enable_dbo)
         extra_builder_kwargs['enable_dbo_across_dp'] = enable_dbo
 
         if self.torchair_graph_enabled and not with_prefill:
@@ -1791,15 +1789,9 @@ def _dummy_run(
         with_prefill: bool = False,
         is_torchair_compile: bool = False,
     ) -> torch.Tensor:
-        maybe_padded_num_tokens = num_tokens
-        if self.torchair_graph_enabled and not with_prefill:
-            maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
-                num_tokens)
-
         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
-         _) = self._get_forward_metadata_across_dp(maybe_padded_num_tokens,
-                                                   num_tokens, with_prefill,
+         _) = self._get_forward_metadata_across_dp(num_tokens, with_prefill,
                                                    False)
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
```
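Taken together, the two files leave callers such as `_process_reqs` and `_dummy_run` passing only the raw token count, while the padding policy lives behind a single overridable hook. A sketch of the resulting structure (bodies elided; only the class and method names and the signature come from this commit):

```python
from typing import Optional

import torch


class NPUModelRunner:

    def _get_forward_metadata_across_dp(
            self,
            num_tokens: int,
            with_prefill: bool,
            enable_dbo: bool = False,
    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
        # Generic runner: pads only when torchair graph mode is enabled.
        ...


class NPUTorchairModelRunner(NPUModelRunner):

    def _get_forward_metadata_across_dp(
            self,
            num_tokens: int,
            with_prefill: bool,
            enable_dbo: bool = False,
    ) -> tuple[int, Optional[torch.Tensor], bool, bool]:
        # Torchair runner: pads decode batches to a torchair bucket and
        # syncs (padded_tokens, tokens_across_dp, with_prefill, enable_dbo)
        # across the data-parallel group.
        ...
```

Because the call sites dispatch through `self`, the torchair behaviour is selected by constructing `NPUTorchairModelRunner`, rather than by `torchair_graph_enabled` checks scattered through `_process_reqs` and `_dummy_run`.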
