
Commit c468b21

fhl2000, ProExpertProg, and hmellor authored and committed
[MISC] cudagraph_capture_sizes related improvements (vllm-project#26016)

Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: 0xrushi <6279035+0xrushi@users.noreply.github.com>

1 parent 088c922 commit c468b21

14 files changed: +303 −110 lines

tests/compile/test_config.py
Lines changed: 73 additions & 0 deletions

@@ -1,13 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+from contextlib import nullcontext

 import pytest

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.config.compilation import CompilationMode
+from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer


@@ -233,3 +236,73 @@ def test_resolve_operator_overload():
     assert len(resolved) == 2  # Only 2 valid ops
     assert resolved[0] is torch.ops.aten.mm.default
     assert resolved[1] is torch.ops.aten.addmm.default
+
+
+@pytest.mark.skipif(
+    not current_platform.support_static_graph_mode(),
+    reason="Skip if cudagraph mode is not supported",
+)
+@pytest.mark.parametrize(
+    (
+        "cudagraph_capture_sizes",
+        "max_cudagraph_capture_size",
+        "tp_size",
+        "enable_sequence_parallelism",
+        "max_num_batched_tokens",
+        "use_cudagraph",
+        "expected_max_size",
+    ),
+    [
+        (None, None, 1, False, 2048, True, 512),
+        ([1, 2, 4], 4, 1, False, 2048, True, 4),
+        ([1, 2, 4], 8, 1, False, 2048, True, RuntimeError),
+        ([1, 256], None, 1, False, 2048, True, 256),
+        ([], None, 1, False, 2048, False, 0),
+        (None, 0, 1, False, 2048, False, 0),
+        # truncated to the nearest multiple of 8 or 16
+        (None, 257, 1, False, 2048, True, 256),
+        ([1, 2, 4, 15], None, 1, False, 2048, True, 15),  # max from list
+        ([1, 2, 4, 15], None, 2, True, 2048, True, 4),  # 15 filtered out due to SP
+        ([1, 2, 4, 15], None, 1, False, 8, True, 4),  # limited by max_num_batched_tokens
+        # the list should contain at least 1 element when cudagraph is used
+        ([], None, 1, False, 2048, True, RuntimeError),
+        # the max capture size should be >= 1 when cudagraph is used
+        (None, 0, 1, False, 2048, True, RuntimeError),
+    ],
+)
+def test_cudagraph_sizes_post_init(
+    cudagraph_capture_sizes,
+    max_cudagraph_capture_size,
+    tp_size,
+    enable_sequence_parallelism,
+    max_num_batched_tokens,
+    use_cudagraph,
+    expected_max_size,
+):
+    ctx = nullcontext()
+    if isinstance(expected_max_size, type) and issubclass(expected_max_size, Exception):
+        ctx = pytest.raises(expected_max_size)
+
+    cudagraph_mode = CUDAGraphMode.PIECEWISE if use_cudagraph else CUDAGraphMode.NONE
+    with ctx:
+        compilation_config = CompilationConfig(
+            cudagraph_capture_sizes=cudagraph_capture_sizes,
+            max_cudagraph_capture_size=max_cudagraph_capture_size,
+            pass_config={
+                "enable_sequence_parallelism": enable_sequence_parallelism,
+                "enable_fusion": True,
+                "enable_noop": True,
+            },
+            cudagraph_mode=cudagraph_mode,
+        )
+        engine_args = EngineArgs(
+            model="facebook/opt-125m",
+            tensor_parallel_size=tp_size,
+            max_num_batched_tokens=max_num_batched_tokens,
+            compilation_config=compilation_config,
+        )
+        vllm_config = engine_args.create_engine_config()
+
+        assert (
+            vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size
+        )
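The parametrization above pins down how `max_cudagraph_capture_size` is resolved against `cudagraph_capture_sizes`. Outside pytest, a minimal sketch of the consistency rule the test asserts (the model name and sizes here are arbitrary examples, not part of the commit):

    from vllm.config import CompilationConfig, CUDAGraphMode
    from vllm.engine.arg_utils import EngineArgs

    # with an explicit capture list, the declared max must equal the list's maximum
    config = CompilationConfig(
        cudagraph_capture_sizes=[1, 2, 4, 8],
        max_cudagraph_capture_size=8,
        cudagraph_mode=CUDAGraphMode.PIECEWISE,
    )
    args = EngineArgs(model="facebook/opt-125m", compilation_config=config)
    vllm_config = args.create_engine_config()
    assert vllm_config.compilation_config.max_cudagraph_capture_size == 8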

vllm/config/compilation.py
Lines changed: 40 additions & 38 deletions

@@ -154,6 +154,8 @@ class CompilationConfig:
     - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
     - [`cudagraph_capture_sizes`]
       [vllm.config.CompilationConfig.cudagraph_capture_sizes]
+    - [`max_cudagraph_capture_size`]
+      [vllm.config.CompilationConfig.max_cudagraph_capture_size]
     - [`cudagraph_num_of_warmups`]
       [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
     - [`cudagraph_copy_inputs`]
@@ -327,18 +329,16 @@ class CompilationConfig:
     more modes may be added.
     """
     use_cudagraph: bool = True
-    """Whether to use cudagraph inside compilation.
-    - False: cudagraph inside compilation is not used.
+    """Whether to use cudagraph inside compilation:
+
+    - False: cudagraph inside compilation is not used.\n
     - True: cudagraph inside compilation is used. It requires
       that all input buffers have fixed addresses, and all
       splitting ops write their outputs to input buffers.
-    In the vLLM V1 Engine, this flag only applies for
-    CompilationMode.VLLM_COMPILE (aka -O3).
-    Note that this is orthogonal to the cudagraph capture logic
-    outside of compilation.
+
     Warning: This flag is deprecated and will be removed in the next major or
-    minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
-    instead.
+    minor release, i.e. v0.11.0 or v1.0.0. Please use
+    cudagraph_mode=FULL_AND_PIECEWISE instead.
     """
     cudagraph_num_of_warmups: int = 0
     """Number of warmup runs for cudagraph.
@@ -398,8 +398,22 @@ class CompilationConfig:
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""

-    max_capture_size: int = field(default=None, init=False)  # type: ignore
-    """not configurable, computed after init"""
+    max_cudagraph_capture_size: int | None = field(default=None)
+    """The maximum cudagraph capture size.
+
+    If cudagraph_capture_sizes is specified, this will be set to the largest
+    size in that list (or, if set explicitly as well, checked for consistency
+    with that list). If cudagraph_capture_sizes is not specified, the list of
+    sizes is generated automatically following the pattern:
+
+        [1, 2, 4] + list(range(8, 256, 8)) + list(
+            range(256, max_cudagraph_capture_size + 1, 16))
+
+    If not specified, max_cudagraph_capture_size is set to min(max_num_seqs * 2,
+    512) by default. This avoids OOM in tight memory scenarios with small
+    max_num_seqs, and prevents capture of many large graphs (>512) that would
+    greatly increase startup time with limited performance benefit.
+    """
     local_cache_dir: str = field(default=None, init=False)  # type: ignore
     """local cache dir for each rank"""
     bs_to_padded_graph_size: list[int] = field(
@@ -408,7 +422,7 @@ class CompilationConfig:
     )
     """optimization:
     Intuitively, bs_to_padded_graph_size should be dict[int, int].
-    since we know all keys are in a range [0, max_capture_size],
+    since we know all keys are in a range [0, max_cudagraph_capture_size],
     we can optimize it to list[int] for better lookup performance."""

     # keep track of enabled and disabled custom ops
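For a concrete feel of the documented default, a small sketch that materializes the generated capture list (assuming the default cap of 512; `max_size` is a local stand-in, not a config attribute):

    # default cap: min(max_num_seqs * 2, 512)
    max_size = 512
    sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, max_size + 1, 16))
    print(sizes[:6])   # [1, 2, 4, 8, 16, 24]
    print(sizes[-3:])  # [480, 496, 512]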
@@ -672,25 +686,12 @@ def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:

         return VllmBackend(vllm_config)

-    def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
-        """To complete the initialization of config,
-        we need to know the cudagraph sizes."""
-
-        if self.cudagraph_capture_sizes is None:
-            self.cudagraph_capture_sizes = cudagraph_capture_sizes
-        else:
-            # de-duplicate the sizes provided by the config
-            dedup_sizes = list(set(self.cudagraph_capture_sizes))
-            if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
-                logger.info(
-                    (
-                        "cudagraph sizes specified by model runner"
-                        " %s is overridden by config %s"
-                    ),
-                    cudagraph_capture_sizes,
-                    dedup_sizes,
-                )
-            self.cudagraph_capture_sizes = dedup_sizes
+    def post_init_cudagraph_sizes(self) -> None:
+        """Complete the initialization after cudagraph-related
+        configs are set. This includes:
+        - initializing compile_sizes
+        - pre-computing the mapping bs_to_padded_graph_size
+        """

         computed_compile_sizes = []
         if self.compile_sizes is not None:
@@ -708,23 +709,24 @@ def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
                 computed_compile_sizes.append(x)
         self.compile_sizes = computed_compile_sizes  # type: ignore

-        # sort to make sure cudagraph capture sizes are in descending order
-        self.cudagraph_capture_sizes.sort(reverse=True)
-        self.max_capture_size = (
-            self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
-        )
+        # make sure the sizes are in ascending order
+        self.cudagraph_capture_sizes.sort()
+        if self.cudagraph_capture_sizes:
+            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size

         # pre-compute the mapping from batch size to padded graph size
-        self.bs_to_padded_graph_size = [0 for i in range(self.max_capture_size + 1)]
+        self.bs_to_padded_graph_size = [
+            0 for i in range(self.max_cudagraph_capture_size + 1)
+        ]
         for end, start in zip(
-            self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]
+            self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
+            [0] + self.cudagraph_capture_sizes,
         ):
             for bs in range(start, end):
                 if bs == start:
                     self.bs_to_padded_graph_size[bs] = start
                 else:
                     self.bs_to_padded_graph_size[bs] = end
-        self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size

     def set_splitting_ops_for_v1(self):
         # NOTE: this function needs to be called only when mode is
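The rewritten padding map walks the capture sizes in ascending order: exact capture sizes map to themselves, and every other batch size rounds up to the next capture size. A self-contained sketch of that loop (local names and an example size list, mirroring the diff's logic):

    capture_sizes = [1, 2, 4, 8]   # example ascending capture list
    max_size = capture_sizes[-1]   # plays the role of max_cudagraph_capture_size

    bs_to_padded = [0] * (max_size + 1)
    for end, start in zip(capture_sizes + [max_size + 1], [0] + capture_sizes):
        for bs in range(start, end):
            # exact capture sizes map to themselves; in-between sizes round up
            bs_to_padded[bs] = start if bs == start else end

    assert bs_to_padded == [0, 1, 2, 4, 4, 8, 8, 8, 8]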

vllm/config/scheduler.py
Lines changed: 0 additions & 15 deletions

@@ -71,14 +71,6 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""

-    cuda_graph_sizes: list[int] = field(default_factory=list)
-    """Cuda graph capture sizes
-    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
-    2. if one value is provided, then the capture list would follow the
-    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    3. more than one value (e.g. 1 2 128) is provided, then the capture list
-    will follow the provided list."""
-
     enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""
@@ -235,13 +227,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
             self.long_prefill_token_threshold,
         )

-        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
-        # This avoids OOM in tight memory scenarios with small max_num_seqs,
-        # and prevents capture of many large graphs (>512) that would greatly
-        # increase startup time with limited performance benefit.
-        if not self.cuda_graph_sizes:
-            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
-
         if self.async_scheduling:
             self.scheduler_cls = "vllm.v1.core.sched.async_scheduler.AsyncScheduler"
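With cuda_graph_sizes gone from SchedulerConfig, capture sizes now live entirely on CompilationConfig. A hedged before/after sketch of the migration (field names come from this diff; the value 256 is an arbitrary example):

    from vllm.config import CompilationConfig

    # before: SchedulerConfig(cuda_graph_sizes=[256])  # removed by this commit
    # after: cap the auto-generated capture list via the compilation config
    config = CompilationConfig(max_cudagraph_capture_size=256)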
