replaced CompilationMode.PIECEWISE with CompilationMode.VLLM_COMPILE

morrison-turnansky · morrison-turnansky · commit af378794ceaa · 2025-10-07T13:07:55.000Z
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
@@ -66,7 +66,7 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
     llm = LLM(
         model="meta-llama/Llama-3.1-8B-Instruct",
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             # By default, it goes up to max_num_seqs
             cudagraph_capture_sizes=[1, 2, 4, 8, 16],
         ),
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
@@ -190,13 +190,13 @@ def run_model(
         return output.cpu()
 
 
-def test_multi_graph_piecewise_compile_outputs_equal():
+def test_multi_graph_vllmcompile_compile_outputs_equal():
     outputs = []
 
-    # piecewise compile
+    # VLLM_COMPILE mode with CUDA graph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly.attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -265,7 +265,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
     # piecewise compile without CUDA graph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=False,
             splitting_ops=["silly.attention"],
         )
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
@@ -62,7 +62,7 @@ def _run_simple_model(
 ):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             use_inductor=use_inductor,
             splitting_ops=splitting_ops,
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
@@ -262,7 +262,7 @@ def run_model(
 ) -> torch.Tensor:
     if use_compile:
         compilation_config = CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             use_inductor=use_inductor,
             cudagraph_capture_sizes=[1, 2],
@@ -436,14 +436,14 @@ def benchmark():
     for piecewise in [False, True]:
         if piecewise:
             compilation_config = CompilationConfig(
-                level=CompilationMode.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_cudagraph=True,
                 splitting_ops=["silly.attention"],
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationMode.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
@@ -124,7 +124,7 @@ def test_compile_correctness(
 
         for level in [
             CompilationMode.NO_COMPILATION,
-            CompilationMode.PIECEWISE,
+            CompilationMode.VLLM_COMPILE,
         ]:
             all_args.append(final_args + [f"-O{level}"])
             all_envs.append({})
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
@@ -66,10 +66,10 @@ def run_model(
 
 
 def test_ignore_torch_compile_decorator():
-    # piecewise
+    # VLLM_COMPILE mode
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly.attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -184,7 +184,7 @@ def test_conditional_compile_enable_if():
             kv_sharing_fast_prefill=True,
         ),
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly.attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -216,7 +216,7 @@ def test_conditional_compile_enable_if():
             kv_sharing_fast_prefill=False,
         ),
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly.attention"],
             cudagraph_capture_sizes=[1, 2],
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
@@ -83,7 +83,7 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
 
 @pytest.mark.parametrize(
     "optimization_level",
-    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.PIECEWISE],
+    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
 )
 @pytest.mark.parametrize("model_info", models_list(all=True))
 @create_new_process_for_each_test()
@@ -106,7 +106,7 @@ def test_full_graph(
     [
         # additional compile sizes, only some of the models
         (
-            CompilationConfig(level=CompilationMode.PIECEWISE, compile_sizes=[1, 2]),
+            CompilationConfig(level=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
             model,
         )
         for model in models_list(all=False)
@@ -115,7 +115,7 @@ def test_full_graph(
         # RMSNorm + quant fusion, only 8-bit quant models
         (
             CompilationConfig(
-                level=CompilationMode.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 custom_ops=["+rms_norm"],
                 pass_config=PassConfig(enable_fusion=True, enable_noop=True),
             ),
@@ -127,7 +127,8 @@ def test_full_graph(
         # Test depyf integration works
         (
             CompilationConfig(
-                level=CompilationMode.PIECEWISE, debug_dump_path=tempfile.gettempdir()
+                level=CompilationMode.VLLM_COMPILE,
+                debug_dump_path=tempfile.gettempdir(),
             ),
             ("facebook/opt-125m", {}),
         ),
@@ -136,7 +137,7 @@ def test_full_graph(
         # graph inductor partition
         (
             CompilationConfig(
-                level=CompilationMode.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 # inductor graph partition uses
                 # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                 use_inductor_graph_partition=True,
@@ -167,7 +168,7 @@ def test_custom_compile_config(
 
 @pytest.mark.parametrize(
     "optimization_level",
-    [CompilationMode.NO_COMPILATION, CompilationMode.PIECEWISE],
+    [CompilationMode.NO_COMPILATION, CompilationMode.VLLM_COMPILE],
 )
 def test_fp8_kv_scale_compile(optimization_level: int):
     model = "Qwen/Qwen2-0.5B"
@@ -186,7 +187,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
 
     model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
     compilation_config = CompilationConfig(
-        level=CompilationMode.PIECEWISE,
+        level=CompilationMode.VLLM_COMPILE,
         use_inductor_graph_partition=True,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
         custom_ops=["+quant_fp8"],
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             custom_ops=["+rms_norm", "+quant_fp8"],
             pass_config=PassConfig(enable_fusion=True, enable_noop=True),
         )
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py
@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
+            level=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
         )
     )
     vllm_config.compilation_config.pass_config = PassConfig(
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
@@ -431,7 +431,7 @@ def test_attention_quant_pattern(
         ),
         scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             custom_ops=["+quant_fp8"],
             use_inductor_graph_partition=use_inductor_graph_partition,
         ),
diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py
@@ -42,7 +42,7 @@ def forward(self, x):
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             pass_config=PassConfig(enable_noop=True),
         )
     )
@@ -89,7 +89,7 @@ def forward(self, x):
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             pass_config=PassConfig(enable_noop=True),
         )
     )
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -42,7 +42,7 @@ def _create_vllm_config(
     mock_config.parallel_config = ParallelConfig()
 
     # Mimic the behavior of VllmConfig.__post_init__()
-    if compilation_config.level == CompilationMode.PIECEWISE:
+    if compilation_config.level == CompilationMode.VLLM_COMPILE:
         compilation_config.set_splitting_ops_for_v1()
 
     return mock_config
@@ -59,7 +59,7 @@ class TestCudagraphDispatcher:
             # Test case 2: Full CG for uniform batches, no CG for mixed
             (2, "FULL_DECODE_ONLY", CompilationMode.NO_COMPILATION),
             # Test case 3: Piecewise for all
-            (3, "PIECEWISE", CompilationMode.PIECEWISE),
+            (3, "PIECEWISE", CompilationMode.VLLM_COMPILE),
         ],
     )
     def test_dispatcher(self, cudagraph_mode_str, compilation_level):
@@ -242,7 +242,7 @@ class TestCudagraphIntegration:
     def setup_method(self):
         # only FULL mode for non-uniform batches
         self.comp_config = CompilationConfig(
-            level=CompilationMode.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             cudagraph_mode="FULL",
             cudagraph_capture_sizes=[10, 20],
         )
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -75,7 +75,7 @@ def test_kv_sharing_fast_prefill(
         # This allows vLLM compilation backend to handle allocating and
         # managing buffers for cudagraph
         cudagraph_copy_inputs=True,
-        level=CompilationMode.PIECEWISE
+        level=CompilationMode.VLLM_COMPILE
         if not enforce_eager
         else CompilationMode.NO_COMPILATION,
     )
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
@@ -443,7 +443,7 @@ def set_model_tag(tag: str):
 
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
-    It is used for compilation level of `CompilationMode.PIECEWISE`,
+    It is used for compilation level of `CompilationMode.VLLM_COMPILE`,
     where we customize the compilation.
 
     The major work of this backend is to split the graph into
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
@@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
 
     compilation_config: CompilationConfig = vllm_config.compilation_config
     path = vllm_config.compile_debug_dump_path()
-    if compilation_config.level == CompilationMode.PIECEWISE and path:
+    if compilation_config.level == CompilationMode.VLLM_COMPILE and path:
         import depyf
 
         path.mkdir(parents=True, exist_ok=True)
@@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
 
 def end_monitoring_torch_compile(vllm_config: VllmConfig):
     compilation_config: CompilationConfig = vllm_config.compilation_config
-    if compilation_config.level == CompilationMode.PIECEWISE:
+    if compilation_config.level == CompilationMode.VLLM_COMPILE:
         logger.info(
             "torch.compile takes %.2f s in total", compilation_config.compilation_time
         )
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
@@ -278,7 +278,7 @@ class CompilationConfig:
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
     In the vLLM V1 Engine, this flag only applies for
-    CompilationMode.PIECEWISE (aka -O3).
+    CompilationMode.VLLM_COMPILE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     Warning: This flag is deprecated and will be removed in the next major or
@@ -542,7 +542,7 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
 
         # TODO: pass user-specified backend to piecewise compilation
         # merge with the config use_inductor
-        assert self.level == CompilationMode.PIECEWISE
+        assert self.level == CompilationMode.VLLM_COMPILE
 
         from vllm.compilation.backends import VllmBackend
 
@@ -604,10 +604,10 @@ def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
 
     def set_splitting_ops_for_v1(self):
         # NOTE: this function needs to be called only when level is
-        # CompilationMode.PIECEWISE
-        assert self.level == CompilationMode.PIECEWISE, (
+        # CompilationMode.VLLM_COMPILE
+        assert self.level == CompilationMode.VLLM_COMPILE, (
             "set_splitting_ops_for_v1 should only be called when "
-            "level is CompilationMode.PIECEWISE"
+            "level is CompilationMode.VLLM_COMPILE"
         )
 
         if self.use_inductor_graph_partition:
@@ -690,12 +690,12 @@ def splitting_ops_contain_attention(self) -> bool:
 
     def is_attention_compiled_piecewise(self) -> bool:
         use_fx_graph_piecewise_compilation = (
-            self.level == CompilationMode.PIECEWISE
+            self.level == CompilationMode.VLLM_COMPILE
             and self.splitting_ops_contain_attention()
         )
 
         inductor_used = (
-            self.level == CompilationMode.PIECEWISE and self.use_inductor
+            self.level == CompilationMode.VLLM_COMPILE and self.use_inductor
         ) or (
             self.level >= CompilationMode.STOCK_TORCH_COMPILE
             and self.backend == "inductor"
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
@@ -310,7 +310,7 @@ def __post_init__(self):
                     self.model_config is not None
                     and not self.model_config.enforce_eager
                 ):
-                    self.compilation_config.level = CompilationMode.PIECEWISE
+                    self.compilation_config.level = CompilationMode.VLLM_COMPILE
                 else:
                     self.compilation_config.level = CompilationMode.NO_COMPILATION
 
@@ -332,7 +332,7 @@ def __post_init__(self):
             if self.compilation_config.cudagraph_mode is None:
                 if (
                     envs.VLLM_USE_V1
-                    and self.compilation_config.level == CompilationMode.PIECEWISE
+                    and self.compilation_config.level == CompilationMode.VLLM_COMPILE
                 ):
                     # default to full and piecewise for most models
                     self.compilation_config.cudagraph_mode = (
@@ -462,7 +462,7 @@ def __post_init__(self):
         # Do this after all the updates to compilation_config.level
         if (
             envs.VLLM_USE_V1
-            and self.compilation_config.level == CompilationMode.PIECEWISE
+            and self.compilation_config.level == CompilationMode.VLLM_COMPILE
         ):
             self.compilation_config.set_splitting_ops_for_v1()
 
@@ -481,8 +481,8 @@ def __post_init__(self):
                 )
 
             if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
-                assert self.compilation_config.level == CompilationMode.PIECEWISE, (
-                    "Compilation level should be CompilationMode.PIECEWISE "
+                assert self.compilation_config.level == CompilationMode.VLLM_COMPILE, (
+                    "Compilation level should be CompilationMode.VLLM_COMPILE "
                     "when cudagraph_mode piecewise cudagraphs is used, "
                     f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                 )
@@ -810,7 +810,7 @@ def set_current_vllm_config(
 
         if (
             check_compile
-            and vllm_config.compilation_config.level == CompilationMode.PIECEWISE
+            and vllm_config.compilation_config.level == CompilationMode.VLLM_COMPILE
             and compilation_counter.num_models_seen == num_models_seen
         ):
             # If the model supports compilation,
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
@@ -135,7 +135,7 @@ def default_on() -> bool:
 
         compilation_config = get_cached_compilation_config()
         default_on = (
-            compilation_config.level < CompilationMode.PIECEWISE
+            compilation_config.level < CompilationMode.VLLM_COMPILE
             or not compilation_config.use_inductor
         )
         count_none = compilation_config.custom_ops.count("none")
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -419,7 +419,7 @@ def __init__(
         if pad_output is None:
             config = get_current_vllm_config().compilation_config
             pad_output = (
-                config.level < CompilationMode.PIECEWISE
+                config.level < CompilationMode.VLLM_COMPILE
                 and self.preferred_backend == "torch"
             )
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py

Original file line number	Diff line number	Diff line change
`@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(`
`114`	`114`
`115`	`115`	`vllm_config = VllmConfig(`
`116`	`116`	`compilation_config=CompilationConfig(`
`117`		`- level=CompilationMode.PIECEWISE,`
	`117`	`+ level=CompilationMode.VLLM_COMPILE,`
`118`	`118`	`custom_ops=["+rms_norm", "+quant_fp8"],`
`119`	`119`	`pass_config=PassConfig(enable_fusion=True, enable_noop=True),`
`120`	`120`	`)`
Original file line number	Diff line number	Diff line change
`@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(`
`219`	`219`
`220`	`220`	`vllm_config = VllmConfig(`
`221`	`221`	`compilation_config=CompilationConfig(`
`222`		`- level=CompilationMode.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]`
	`222`	`+ level=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]`
`223`	`223`	`)`
`224`	`224`	`)`
`225`	`225`	`vllm_config.compilation_config.pass_config = PassConfig(`