
Commit 96b9aa5

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): rename compilation level to compilation mode, deprecate compilation level (#26355)

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
1 parent e66d787 commit 96b9aa5
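
The hunks below apply the rename uniformly: the `level` field of `CompilationConfig` becomes `mode`, and the `CompilationLevel` enum becomes `CompilationMode` with renamed members. As a quick orientation, here is a minimal usage sketch assembled from the renames visible in this diff (the model name is simply the one used in the docs examples):

```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode

# Renames visible in the hunks below (old CompilationLevel -> new CompilationMode):
#   NO_COMPILATION -> NONE
#   DYNAMO_AS_IS   -> STOCK_TORCH_COMPILE
#   DYNAMO_ONCE    -> DYNAMO_TRACE_ONCE
#   PIECEWISE      -> VLLM_COMPILE
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,  # replaces the deprecated `level=` keyword
    ),
)
```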

42 files changed: 271 additions & 249 deletions

docs/configuration/conserving_memory.md

Lines changed: 2 additions & 2 deletions

@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inference
 
 ```python
 from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode
 
 llm = LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        mode=CompilationMode.VLLM_COMPILE,
         # By default, it goes up to max_num_seqs
         cudagraph_capture_sizes=[1, 2, 4, 8, 16],
     ),

docs/design/cuda_graphs.md

Lines changed: 2 additions & 2 deletions

@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
     """NO CUDA Graphs support"""
 ```
 
-Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
 
 The following table lists backends that support full CUDA Graphs at the time of writing.
 
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
 import vllm
 from vllm.config import CUDAGraphMode
 
-compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     dtype="auto",

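For readers following the dict-form snippet above, the same setting can also be written with the config objects this commit renames. This is a minimal sketch, assuming the object form accepts the same fields shown elsewhere in this diff (`mode`, `cudagraph_mode`); `"mode": 3` in the dict corresponds to `CompilationMode.VLLM_COMPILE`:

```python
import vllm
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

# Object-form equivalent of {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}.
compilation_config = CompilationConfig(
    mode=CompilationMode.VLLM_COMPILE,
    cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE,
)
model = vllm.LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype="auto",
    compilation_config=compilation_config,
)
```
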
examples/offline_inference/data_parallel.py

Lines changed: 1 addition & 1 deletion

@@ -95,7 +95,7 @@ def parse_args():
     parser.add_argument(
         "--compilation-config",
         type=int,
-        help=("Compilation optimization (O) level 0-3."),
+        help=("Compilation optimization (O) mode 0-3."),
     )
     parser.add_argument(
         "--quantization",

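The hunk above only renames the help text; the integer the flag parses (0-3) still selects the -O optimization mode. A hypothetical follow-through, assuming `LLM` accepts the integer form of `compilation_config` the way the `-O` shorthand suggests, with 3 corresponding to `CompilationMode.VLLM_COMPILE`:

```python
from vllm import LLM

# Hypothetical: forward the parsed --compilation-config integer straight to the engine.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    compilation_config=3,  # -O3, i.e. CompilationMode.VLLM_COMPILE
)
```
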
tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 5 additions & 5 deletions

@@ -14,7 +14,7 @@
 from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
 
     outputs = []
 
-    # piecewise compile
+    # vllm compile
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # no compile or cudagraph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
+            mode=CompilationMode.NONE,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.NONE
@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # piecewise compile without CUDA graph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=False,
             splitting_ops=["silly::attention"],
             use_inductor_graph_partition=use_inductor_graph_partition,

tests/compile/piecewise/test_simple.py

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -61,7 +61,7 @@ def _run_simple_model(
 ):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             use_inductor=use_inductor,
             splitting_ops=splitting_ops,

tests/compile/piecewise/test_toy_llama.py

Lines changed: 5 additions & 5 deletions

@@ -21,7 +21,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -356,13 +356,13 @@ def test_toy_llama(
     )
 
     compile_config_no_compile = CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION,
+        level=CompilationMode.NONE,
         cudagraph_mode=CUDAGraphMode.NONE,
         backend="eager",
     )
 
     compile_config_no_split = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        level=CompilationMode.VLLM_COMPILE,
         use_inductor_graph_partition=use_inductor_graph_partition,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
         backend=backend,
@@ -458,14 +458,14 @@ def benchmark():
     for piecewise in [False, True]:
         if piecewise:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 use_cudagraph=True,
                 splitting_ops=["silly::attention"],
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 cudagraph_capture_sizes=cudagraph_sizes,
             )

tests/compile/test_aot_compile.py

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -38,7 +38,7 @@ def forward(self, x: torch.Tensor):
 def make_vllm_config() -> VllmConfig:
     return VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
         )
     )

tests/compile/test_async_tp.py

Lines changed: 2 additions & 1 deletion

@@ -10,6 +10,7 @@
 from vllm.compilation.collective_fusion import AsyncTPPass
 from vllm.config import (
     CompilationConfig,
+    CompilationMode,
     DeviceConfig,
     ModelConfig,
     PassConfig,
@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
         common_args.append("--enforce-eager")
 
     compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
         "compile_sizes": [2, 4, 8],
        "splitting_ops": [],
        "pass_config": {"enable_async_tp": async_tp_enabled},

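The hunk above moves the dict-form config from the numeric "level" key to a "mode" key holding the enum. As a usage sketch (assuming, as in the docs hunk earlier in this commit, that a plain dict passed as `compilation_config` is parsed into a `CompilationConfig`; `enable_async_tp` is set to True purely for illustration):

```python
from vllm import LLM
from vllm.config import CompilationMode

# Dict-form config mirroring the test above.
compilation_config = {
    "mode": CompilationMode.VLLM_COMPILE,
    "compile_sizes": [2, 4, 8],
    "splitting_ops": [],
    "pass_config": {"enable_async_tp": True},
}

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    compilation_config=compilation_config,
)
```
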
tests/compile/test_basic_correctness.py

Lines changed: 14 additions & 16 deletions

@@ -4,7 +4,7 @@
 
 import pytest
 
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 from vllm.utils import cuda_device_count_stateless
 
 from ..utils import compare_all_settings
@@ -21,7 +21,7 @@ class TestSetting:
 
 
 # we cannot afford testing the full Cartesian product
-# of all models and all levels
+# of all models and all modes
 @pytest.mark.parametrize(
     "test_setting",
     [
@@ -121,15 +121,13 @@ def test_compile_correctness(
     all_args: list[list[str]] = []
     all_envs: list[dict[str, str] | None] = []
 
-    for comp_level in [
-        CompilationLevel.DYNAMO_AS_IS,
-        CompilationLevel.DYNAMO_ONCE,
-        CompilationLevel.PIECEWISE,
+    for comp_mode in [
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
     ]:
-        for level in [CompilationLevel.NO_COMPILATION, comp_level]:
-            all_args.append(
-                final_args + [f"-O.level={level}", "-O.backend=inductor"]
-            )
+        for mode in [CompilationMode.NONE, comp_mode]:
+            all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
 
     # inductor will change the output, so we only compare if the output
     # is close, not exactly the same.
@@ -142,13 +140,13 @@ def test_compile_correctness(
     all_envs.clear()
     all_args.clear()
 
-    for level in [
-        CompilationLevel.NO_COMPILATION,
-        CompilationLevel.DYNAMO_AS_IS,
-        CompilationLevel.DYNAMO_ONCE,
-        CompilationLevel.PIECEWISE,
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
     ]:
-        all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+        all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
         all_envs.append({})
         all_envs.append({})

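These hunks also switch the dotted CLI override from `-O.level=` to `-O.mode=`. A small sketch of one iteration of the loops above, with placeholder base arguments (hypothetical values; `final_args` is not shown in this diff):

```python
from vllm.config import CompilationMode

# One hypothetical iteration of the loop above: build the override arguments
# that select the compilation mode and the inductor backend.
final_args = ["--model", "meta-llama/Llama-3.1-8B-Instruct"]  # placeholder base args
mode = CompilationMode.VLLM_COMPILE
args = final_args + [f"-O.mode={mode}", "-O.backend=inductor"]
print(args)
```
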
tests/compile/test_config.py

Lines changed: 10 additions & 10 deletions

@@ -4,7 +4,7 @@
 
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.compilation import CompilationLevel
+from vllm.config.compilation import CompilationMode
 from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
 
 
@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-def test_dynamo_as_is(vllm_runner, monkeypatch):
+def test_stock_torch_compile(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(dynamo_as_is_count=1),
+        compilation_counter.expect(stock_torch_compile_count=1),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 1},
+            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 0},
+            compilation_config={"mode": CompilationMode.NONE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 splitting_ops=["vllm::unified_attention"],
             )
@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
     # When attn_fusion pass enabled, splitting_ops now default to attention ops.
     config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
