
Commit 1598b45

Perf: support PIECEWISE cuda graph for PCP
Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com>
1 parent e2e2952 commit 1598b45

File tree

4 files changed: +28 −20 lines changed

vllm/config/vllm.py

Lines changed: 9 additions & 0 deletions
@@ -359,6 +359,15 @@ def __post_init__(self):
         ):
             self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
+        # Prefill context parallel does not support full CUDA graphs yet.
+        if self.parallel_config.prefill_context_parallel_size > 1:
+            logger.warning(
+                "Prefill context parallel (PCP) is enabled, which is "
+                "incompatible with full CUDA graphs. Setting "
+                "cudagraph_mode to PIECEWISE."
+            )
+            self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         # decode context parallel do not support full cudagraphs now.
         if self.parallel_config.decode_context_parallel_size > 1:
             logger.warning(
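
Net effect: enabling PCP now downgrades full CUDA graph capture to piecewise capture instead of disabling CUDA graphs outright (the old opt-out in vllm/platforms/cuda.py is deleted below). A minimal, self-contained sketch of this fallback rule, with toy stand-ins rather than vLLM's real VllmConfig and CUDAGraphMode:

```python
# Toy sketch of the fallback rule above; ToyConfig and the enum values are
# illustrative stand-ins, not vLLM's real VllmConfig / CUDAGraphMode.
from dataclasses import dataclass
from enum import Enum, auto


class CUDAGraphMode(Enum):
    NONE = auto()
    PIECEWISE = auto()
    FULL = auto()


@dataclass
class ToyConfig:
    prefill_context_parallel_size: int = 1
    cudagraph_mode: CUDAGraphMode = CUDAGraphMode.FULL

    def __post_init__(self) -> None:
        # PCP cannot run under full CUDA graphs, so fall back to piecewise
        # capture instead of turning CUDA graphs off entirely (the old behavior).
        if self.prefill_context_parallel_size > 1:
            self.cudagraph_mode = CUDAGraphMode.PIECEWISE


cfg = ToyConfig(prefill_context_parallel_size=2)
assert cfg.cudagraph_mode is CUDAGraphMode.PIECEWISE
```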

vllm/platforms/cuda.py

Lines changed: 0 additions & 7 deletions
@@ -206,13 +206,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             )
             compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
-        if (
-            compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and parallel_config.prefill_context_parallel_size > 1
-        ):
-            logger.info("Prefill Context Parallel: disabling cudagraphs since PCP.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
     @classmethod
     def get_current_memory_usage(
         cls, device: torch.types.Device | None = None

vllm/v1/attention/backends/flashinfer.py

Lines changed: 13 additions & 12 deletions
@@ -1014,24 +1014,25 @@ def forward(
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
-        key_across_cp = get_pcp_group().all_gather(key.contiguous(), dim=0)
-        value_across_cp = get_pcp_group().all_gather(value.contiguous(), dim=0)
-        if (
-            self.pcp_world_size > 1
-            and attn_metadata.pcp_allgather_restore_idx is not None
-        ):
-            # Reorder kv after cp allgather.
+        if self.pcp_world_size > 1:
+            assert attn_metadata.pcp_allgather_restore_idx is not None
+            # NOTE(yyj): we must `slice` key and value because pcp_allgather_restore_idx
+            # ignores the padding from CUDA Graph. To be optimized for performance!
+            key_across_cp = get_pcp_group().all_gather(
+                key[:num_actual_tokens].contiguous(), dim=0
+            )
+            value_across_cp = get_pcp_group().all_gather(
+                value[:num_actual_tokens].contiguous(), dim=0
+            )
+            # Reorder kv after pcp allgather.
             # Note that there are duplicate decoding tokens,
             # but we only save the first one in kvcache.
-            key_across_cp = torch.index_select(
+            key = torch.index_select(
                 key_across_cp, 0, attn_metadata.pcp_allgather_restore_idx
             )
-            value_across_cp = torch.index_select(
+            value = torch.index_select(
                 value_across_cp, 0, attn_metadata.pcp_allgather_restore_idx
             )
-            key = key_across_cp
-            value = value_across_cp
-
         if self.kv_sharing_target_layer_name is None:
             # Reshape the input keys and values and store them in the cache.
             # Skip this if sharing KV cache with an earlier attention layer.
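
The key change: under piecewise CUDA graphs, key and value are padded to the captured batch size, but pcp_allgather_restore_idx is built over actual tokens only, so the padding must be sliced off before the all-gather. A runnable single-process sketch of that slice, gather, reorder pattern (torch.cat stands in for the collective, and the interleaved restore index is hypothetical, purely to illustrate the reordering step):

```python
import torch

num_actual_tokens = 3      # real tokens on each PCP rank this step
# Per-rank key tensors, zero-padded up to the CUDA-graph batch size of 4.
key_rank0 = torch.tensor([10.0, 11.0, 12.0, 0.0])
key_rank1 = torch.tensor([20.0, 21.0, 22.0, 0.0])

# Slice off the graph padding *before* the gather, as the diff does.
key_across_cp = torch.cat(
    [key_rank0[:num_actual_tokens], key_rank1[:num_actual_tokens]]
)

# Restore index built over actual tokens only (here: interleave the shards).
pcp_allgather_restore_idx = torch.tensor([0, 3, 1, 4, 2, 5])
key = torch.index_select(key_across_cp, 0, pcp_allgather_restore_idx)
print(key)  # tensor([10., 20., 11., 21., 12., 22.])

# Without the slice, the gathered layout would be
# [10, 11, 12, 0, 20, 21, 22, 0]: every rank-1 entry in the restore index
# would be off by the pad length and select the wrong (or a padding) token.
```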

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 1 deletion
@@ -2696,7 +2696,12 @@ def execute_model(
             aux_hidden_states = None
 
         if self.pcp_world_size > 1:
-            hidden_states = get_pcp_group().all_gather(hidden_states, 0)
+            # NOTE: we must `slice` hidden_states because pcp_allgather_restore_idx
+            # ignores the padding from CUDA Graph.
+            hidden_states = get_pcp_group().all_gather(
+                hidden_states[:num_scheduled_tokens],
+                0,
+            )
             hidden_states = torch.index_select(
                 hidden_states,
                 0,
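
The same slice-before-gather rule applies to the model's output hidden states in the runner; a brief shape-level sketch under assumed sizes (the gather is again simulated):

```python
import torch

num_scheduled_tokens = 5
padded_tokens, hidden_size = 8, 16   # graph-captured batch size, hidden dim
hidden_states = torch.randn(padded_tokens, hidden_size)

sliced = hidden_states[:num_scheduled_tokens]
gathered = torch.cat([sliced, sliced])   # stands in for a 2-rank PCP all_gather
assert gathered.shape == (2 * num_scheduled_tokens, hidden_size)
# torch.index_select with pcp_allgather_restore_idx (built for
# 2 * num_scheduled_tokens rows, no padding) then restores token order.
```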
