import numpy.typing as npt
import torch
import torch.nn as nn
+from torch.distributed import ReduceOp
from vllm.attention import AttentionType, get_attn_backend
from vllm.attention.layer import Attention
from vllm.config import CompilationLevel, VllmConfig

from vllm_ascend.attention.attention import AttentionMaskBuilder
from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.patch.platform.patch_common.patch_distributed import \
+    get_dp_group
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
from vllm_ascend.utils import vllm_version_is
@@ -328,6 +331,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
            False) and self.vllm_config.model_config.use_mla
        self.use_cached_npu_graph = additional_config.get(
            "use_cached_npu_graph", False)
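+        # Whether this rank has completed a prefill step; used together with
+        # the data-parallel group below to keep torchair graph execution in
+        # sync across DP ranks.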
+        self.has_prefilled = False
+        self.dp_group = get_dp_group()

    def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
        """Update the cached states and the persistent batch with the scheduler
@@ -635,6 +640,22 @@ def _process_reqs(
                                  device=input_ids.device)
            input_ids = torch.cat([input_ids, padding])
            positions = torch.cat([positions, padding])
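+        # Reset the prefill flag as soon as this rank sees a non-decode batch,
+        # then re-evaluate it across all DP ranks when graph mode is enabled.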
+        if self.has_prefilled and not attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+            self.has_prefilled = False
+        if not self.has_prefilled and self.enable_torchair_graph_mode:
+            self.has_prefilled = self.has_prefilled_all_rank(
+                attn_metadata.attn_state == AscendAttentionState.DecodeOnly)
+
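+        # A decode-only rank in graph mode spins on dummy decode steps until
+        # every DP rank reports that it has finished prefill, so collective
+        # communication stays aligned across ranks.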
+        if self.dp_group:
+            while not self.has_prefilled and self.enable_torchair_graph_mode and attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+                self._dummy_run(1)
+                tensor = torch.tensor([1], dtype=torch.int32, device="cpu")
+                torch.distributed.all_reduce(tensor,
+                                             op=ReduceOp.MAX,
+                                             group=self.dp_group)
+                self.has_prefilled = self.has_prefilled_all_rank(
+                    attn_metadata.attn_state ==
+                    AscendAttentionState.DecodeOnly)

        # Run forward pass
        with set_forward_context(attn_metadata,
@@ -644,7 +665,7 @@ def _process_reqs(
            if self.enable_torchair_graph_mode:
                model_kwargs["kv_caches"] = self.kv_caches
                model_kwargs["attn_metadata"] = attn_metadata
-            if self.enable_torchair_graph_mode and attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+            if self.enable_torchair_graph_mode and attn_metadata.attn_state == AscendAttentionState.DecodeOnly and self.has_prefilled:
                torch._dynamo.mark_static(input_ids)
                torch._dynamo.mark_static(positions)
                torch._dynamo.mark_static(attn_metadata.decode.block_table)
@@ -772,6 +793,15 @@ def _calc_spec_decode_metadata(
        )
        return metadata

+    def has_prefilled_all_rank(self, has_prefilled: bool) -> bool:
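+        # All-reduce with MIN across the DP group: the result is True only if
+        # every rank has completed its prefill step.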
+        tensor = torch.tensor([has_prefilled], dtype=torch.int32, device="cpu")
+        if self.dp_group:
+            torch.distributed.all_reduce(tensor,
+                                         op=ReduceOp.MIN,
+                                         group=self.dp_group)
+        aggregated_has_prefilled = bool(tensor.item())
+        return aggregated_has_prefilled
+
    def apply_grammar_bitmask(
        self,
        scheduler_output: "SchedulerOutput",
@@ -1039,7 +1069,11 @@ def _profile_multimodal(self) -> None:
        self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))

    @torch.inference_mode()
-    def _dummy_run(self, num_tokens: int) -> torch.Tensor:
+    def _dummy_run(
+        self,
+        num_tokens: int,
+        attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
+    ) -> torch.Tensor:
        # Set num_scheduled_tokens based on num_tokens and max_num_seqs
        # for dummy run with LoRA so that the num_reqs collectively
        # has num_tokens in total.
@@ -1083,11 +1117,34 @@ def _dummy_run(self, num_tokens: int) -> torch.Tensor:
            })

        with set_forward_context(None, self.vllm_config):
-            hidden_states = model(
-                input_ids=input_ids,
-                positions=positions,
-                intermediate_tensors=intermediate_tensors,
-                inputs_embeds=inputs_embeds)
+            if self.enable_torchair_graph_mode and attn_state == AscendAttentionState.DecodeOnly:
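+                # Decode-only dummy runs go through the compiled torchair
+                # graph with statically marked inputs, so the traced graph
+                # matches the one used for real decode steps.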
+                attn_metadata = self.attn_metadata_builder.dummy_build(
+                    num_reqs=num_tokens, num_actual_tokens=1)
+                torch._dynamo.mark_static(input_ids)
+                torch._dynamo.mark_static(positions)
+                torch._dynamo.mark_static(attn_metadata.decode.block_table)
+                torch._dynamo.mark_static(
+                    attn_metadata.decode.input_positions)
+                torch._dynamo.mark_static(attn_metadata.slot_mapping)
+                for kv in self.kv_caches:
+                    assert isinstance(kv,
+                                      tuple), "kv_cache must be a tuple"
+                    torch._dynamo.mark_static(kv[0])
+                    torch._dynamo.mark_static(kv[1])
+                hidden_states = self.compile_model(
+                    input_ids=input_ids,
+                    positions=positions,
+                    intermediate_tensors=intermediate_tensors,
+                    inputs_embeds=None,
+                    kv_caches=self.kv_caches,
+                    attn_metadata=attn_metadata,
+                )
+            else:
+                hidden_states = model(
+                    input_ids=input_ids,
+                    positions=positions,
+                    intermediate_tensors=intermediate_tensors,
+                    inputs_embeds=inputs_embeds)
        return hidden_states

    def profile_run(self) -> None: