vllm-project
diff --git a/‎tests/distributed/test_context_parallel.py‎
Lines changed: 25 additions & 4 deletions b/‎tests/distributed/test_context_parallel.py‎
Lines changed: 25 additions & 4 deletions
diff --git a/‎vllm/attention/backends/abstract.py‎
Lines changed: 13 additions & 0 deletions b/‎vllm/attention/backends/abstract.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎vllm/attention/ops/common.py‎
Lines changed: 30 additions & 0 deletions b/‎vllm/attention/ops/common.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎vllm/config/parallel.py‎
Lines changed: 7 additions & 1 deletion b/‎vllm/config/parallel.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎vllm/distributed/parallel_state.py‎
Lines changed: 59 additions & 5 deletions b/‎vllm/distributed/parallel_state.py‎
Lines changed: 59 additions & 5 deletions
@@ -30,13 +30,15 @@ class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
     dcp_size: int
+    pcp_size: int
     eager_mode: bool
     chunked_prefill: bool
 
 
 class CPTestOptions(NamedTuple):
+    """Per-test options carried alongside the parallel setups."""
+
     multi_node_only: bool
     load_format: str | None = None
+    # Name of the attention backend; the comparison helper passes it as the
+    # VLLM_ATTENTION_BACKEND environment value for both the CP and TP runs.
+    attn_backend: str = "FLASH_ATTN"
 
 
 @dataclass
@@ -52,20 +54,25 @@ def detailed(
         tp_base: int = 4,
         pp_base: int = 1,
         dcp_base: int = 1,
+        pcp_base: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
         load_format: str | None = None,
+        attn_backend: str = "FLASH_ATTN",
     ):
         parallel_setups = []
         for eager_mode_val in [False]:
             for pp_multiplier in [1]:
-                for dcp_multiplier in [0.5, 1]:
+                # TODO(qcs): Test the effect of mixed activation
+                # when CP and DCP are compatible.
+                for pcp_multiplier, dcp_multiplier in zip([1, 2, 1], [0.5, 1, 1]):
                     for chunked_prefill_val in [True]:
                         parallel_setups.append(
                             ParallelSetup(
                                 tp_size=tp_base,
                                 pp_size=pp_multiplier * pp_base,
                                 dcp_size=int(dcp_multiplier * tp_base),
+                                pcp_size=int(pcp_multiplier * pcp_base),
                                 eager_mode=eager_mode_val,
                                 chunked_prefill=chunked_prefill_val,
                             )
@@ -75,7 +82,9 @@ def detailed(
             distributed_backends=["mp"],
             runner=runner,
             test_options=CPTestOptions(
-                multi_node_only=multi_node_only, load_format=load_format
+                multi_node_only=multi_node_only,
+                load_format=load_format,
+                attn_backend=attn_backend,
             ),
         )
 
@@ -108,11 +117,12 @@ def _compare_cp_with_tp(
         tp_size,
         pp_size,
         dcp_size,
+        pcp_size,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
 
-    multi_node_only, load_format = test_options
+    multi_node_only, load_format, attn_backend = test_options
 
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
@@ -155,7 +165,7 @@ def _compare_cp_with_tp(
         "--max-model-len",
         "2048",
         "--max-num-seqs",
-        "8",
+        "16",
     ]
     if chunked_prefill:
         common_args.append("--enable-chunked-prefill")
@@ -172,6 +182,10 @@ def _compare_cp_with_tp(
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
+    cp_env = tp_env = {
+        "VLLM_ATTENTION_BACKEND": attn_backend,
+    }
+
     cp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -180,6 +194,8 @@ def _compare_cp_with_tp(
         str(pp_size),
         "--decode-context-parallel-size",
         str(dcp_size),
+        "--prefill-context-parallel-size",
+        str(pcp_size),
         "--distributed-executor-backend",
         distributed_backend,
     ]
@@ -198,19 +214,24 @@ def _compare_cp_with_tp(
         model_id,
         cp_args,
         tp_args,
+        cp_env,
+        tp_env,
         method=method,
         max_wait_seconds=720,
     )
 
 
 CP_TEXT_GENERATION_MODELS = {
+    # Maps a model ID to the list of CPTestSettings variants defined for it.
+    # Each entry differs in tp_base and/or attn_backend.
+    # [MLA attention only]
     "deepseek-ai/DeepSeek-V2-Lite-Chat": [
         CPTestSettings.detailed(),
         CPTestSettings.detailed(tp_base=2),
     ],
     "bigcode/gpt_bigcode-santacoder": [
         CPTestSettings.detailed(),
         CPTestSettings.detailed(tp_base=2),
+        CPTestSettings.detailed(attn_backend="FLASHINFER"),
+        CPTestSettings.detailed(tp_base=2, attn_backend="FLASHINFER"),
     ],
 }
 
 
@@ -127,6 +127,9 @@ class AttentionImpl(ABC, Generic[T]):
     dcp_world_size: int
     dcp_rank: int
 
+    pcp_world_size: int
+    pcp_rank: int
+
     def __new__(cls, *args, **kwargs):
         # use __new__ so that all subclasses will call this
         self = super().__new__(cls)
@@ -139,6 +142,16 @@ def __new__(cls, *args, **kwargs):
             # DCP might not be initialized in testing
             self.dcp_world_size = 1
             self.dcp_rank = 0
+        try:
+            from vllm.distributed.parallel_state import get_pcp_group
+
+            self.pcp_world_size = get_pcp_group().world_size
+            self.pcp_rank = get_pcp_group().rank_in_group
+        except AssertionError:
+            # PCP might not be initialized in testing
+            self.pcp_world_size = 1
+            self.pcp_rank = 0
+
         self.need_to_return_lse_for_decode = (
             self.dcp_world_size > 1 and self.can_return_lse_for_decode
         )
 
@@ -205,6 +205,36 @@ def cp_lse_ag_out_rs(
     return out
 
 
+def cp_lse_ag_out_ar(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    cp_group: GroupCoordinator,
+    ctx: CPTritonContext | None = None,
+):
+    """
+    Combine per-rank attention results across a context-parallel group:
+    all-gather the LSEs, correct the local output with them, then
+    all-reduce the corrected output.
+
+    cp_attn_out: [ B, H, D ]
+    cp_attn_lse: [ B, H ]
+    cp_group: group used for the all-gather and all-reduce
+    ctx: optional CPTritonContext; a fresh one is created when None
+
+    Returns cp_attn_out unchanged when the group has a single rank,
+    otherwise the all-reduced corrected output.
+    """
+    if cp_group.world_size == 1:
+        return cp_attn_out
+
+    if ctx is None:
+        ctx = CPTritonContext()
+
+    # Gather every rank's LSE and expose it as [world_size, *lse_shape].
+    # The target shape is computed directly; no scratch tensor is allocated
+    # just to describe it.
+    cp_attn_lse = cp_attn_lse.contiguous()
+    lses = cp_group.all_gather(cp_attn_lse, dim=0).view(
+        (cp_group.world_size,) + cp_attn_lse.shape
+    )
+    # Only the corrected output is needed here; the second return value of
+    # correct_attn_out is discarded.
+    out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
+    assert out.is_contiguous()
+    out = cp_group.all_reduce(out)
+    return out
+
+
 @triton.jit
 def _pack_seq_kernel(
     x_ptr,  # [N, D]
 
@@ -71,6 +71,8 @@ class ParallelConfig:
     """Number of pipeline parallel groups."""
     tensor_parallel_size: int = 1
     """Number of tensor parallel groups."""
+    prefill_context_parallel_size: int = 1
+    """Number of prefill context parallel groups."""
     data_parallel_size: int = 1
     """Number of data parallel groups. MoE layers will be sharded according to
     the product of the tensor parallel size and data parallel size."""
@@ -467,7 +469,11 @@ def __post_init__(self) -> None:
             )
 
         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
+        self.world_size = (
+            self.pipeline_parallel_size
+            * self.tensor_parallel_size
+            * self.prefill_context_parallel_size
+        )
 
         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
 
@@ -1085,6 +1085,24 @@ def get_pp_group() -> GroupCoordinator:
     return _PP
 
 
+_PCP: GroupCoordinator | None = None
+
+
+def get_pcp_group() -> GroupCoordinator:
+    """Return the prefill context parallel group.
+
+    Raises AssertionError if the group has not been initialized.
+    """
+    assert _PCP is not None, "prefill context parallel group is not initialized"
+    return _PCP
+
+
+def get_prefill_context_model_parallel_world_size():
+    """Return world size for the prefill context parallel group."""
+    return get_pcp_group().world_size
+
+
+def get_prefill_context_model_parallel_rank():
+    """Return my rank for the prefill context parallel group."""
+    return get_pcp_group().rank_in_group
+
+
 @deprecated(
     "`get_pipeline_model_parallel_group` has been replaced with "
     "`get_pp_group` and may be removed in v0.12. Please use "
@@ -1207,6 +1225,7 @@ def init_distributed_environment(
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
+    context_model_parallel_size: int = 1,
     decode_context_model_parallel_size: int | None = 1,
     backend: str | None = None,
 ) -> None:
@@ -1256,7 +1275,11 @@ def initialize_model_parallel(
     # to get group_ranks for each dimension, transpose that dimension to the
     # last dimension, then reshape to 2D, then unbind the last dimension
     all_ranks = torch.arange(world_size).reshape(
-        -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size
+        -1,
+        data_parallel_size,
+        pipeline_model_parallel_size,
+        context_model_parallel_size,
+        tensor_model_parallel_size,
     )  # noqa
 
     # Build the tensor model-parallel groups.
@@ -1295,7 +1318,7 @@ def initialize_model_parallel(
     global _PP
     assert _PP is None, "pipeline model parallel group is already initialized"
     group_ranks = (
-        all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0)
+        all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0)
     )
     group_ranks = [x.tolist() for x in group_ranks]
     _PP = init_model_parallel_group(
@@ -1304,7 +1327,7 @@ def initialize_model_parallel(
 
     global _DP
     assert _DP is None, "data parallel group is already initialized"
-    group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0)
+    group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0)
     group_ranks = [x.tolist() for x in group_ranks]
     _DP = init_model_parallel_group(
         group_ranks, get_world_group().local_rank, backend, group_name="dp"
@@ -1314,29 +1337,46 @@ def initialize_model_parallel(
     assert _EP is None, "expert parallel group is already initialized"
     group_ranks = (
         all_ranks.transpose(1, 2)
-        .reshape(-1, data_parallel_size * tensor_model_parallel_size)
+        .reshape(
+            -1,
+            data_parallel_size
+            * tensor_model_parallel_size
+            * context_model_parallel_size,
+        )
         .unbind(0)
     )
     group_ranks = [x.tolist() for x in group_ranks]
     _EP = init_model_parallel_group(
         group_ranks, get_world_group().local_rank, backend, group_name="ep"
     )
 
+    global _PCP
+    assert _PCP is None, "prefill context parallel group is already initialized"
+    group_ranks = (
+        all_ranks.transpose(3, 4).reshape(-1, context_model_parallel_size).unbind(0)
+    )
+    group_ranks = [x.tolist() for x in group_ranks]
+    _PCP = init_model_parallel_group(
+        group_ranks, get_world_group().local_rank, backend, group_name="pcp"
+    )
+
     logger.info(
         "rank %s in world size %s is assigned as "
-        "DP rank %s, PP rank %s, TP rank %s, EP rank %s",
+        "DP rank %s, PP rank %s, TP rank %s, EP rank %s, PCP rank %s",
         rank,
         world_size,
         _DP.rank_in_group,
         _PP.rank_in_group,
         _TP.rank_in_group,
         _EP.rank_in_group,
+        _PCP.rank_in_group,
     )
 
 
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     pipeline_model_parallel_size: int,
+    prefill_context_model_parallel_size: int = 1,
     decode_context_model_parallel_size: int | None = 1,
     backend: str | None = None,
 ) -> None:
@@ -1349,6 +1389,7 @@ def ensure_model_parallel_initialized(
         initialize_model_parallel(
             tensor_model_parallel_size,
             pipeline_model_parallel_size,
+            prefill_context_model_parallel_size,
             decode_context_model_parallel_size,
             backend,
         )
@@ -1365,6 +1406,12 @@ def ensure_model_parallel_initialized(
         f"got: {pp_world_size=} vs. "
         f"wanted: {pipeline_model_parallel_size=}"
     )
+    pcp_world_size = get_pcp_group().world_size
+    assert pcp_world_size == prefill_context_model_parallel_size, (
+        "prefill context parallel group already initialized, but of unexpected size: "
+        f"{pcp_world_size=} vs. "
+        f"{prefill_context_model_parallel_size=}"
+    )
 
 
 def prepare_communication_buffer_for_model(model: torch.nn.Module):
@@ -1382,6 +1429,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
         _DP.prepare_communication_buffer_for_model(model)
     if _EP is not None:
         _EP.prepare_communication_buffer_for_model(model)
+    if _PCP is not None:
+        _PCP.prepare_communication_buffer_for_model(model)
 
 
 def model_parallel_is_initialized():
@@ -1471,6 +1520,11 @@ def destroy_model_parallel():
         _EP.destroy()
     _EP = None
 
+    global _PCP
+    if _PCP:
+        _PCP.destroy()
+    _PCP = None
+
 
 def destroy_distributed_environment():
     global _WORLD, _NODE_COUNT