Skip to content

Commit af37879

Browse files
Replace CompilationMode.PIECEWISE with CompilationMode.VLLM_COMPILE
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
1 parent 5655663 commit af37879

23 files changed

+54
-52
lines changed

docs/configuration/conserving_memory.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
6666
llm = LLM(
6767
model="meta-llama/Llama-3.1-8B-Instruct",
6868
compilation_config=CompilationConfig(
69-
level=CompilationMode.PIECEWISE,
69+
level=CompilationMode.VLLM_COMPILE,
7070
# By default, it goes up to max_num_seqs
7171
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
7272
),

tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,13 +190,13 @@ def run_model(
190190
return output.cpu()
191191

192192

193-
def test_multi_graph_piecewise_compile_outputs_equal():
193+
def test_multi_graph_vllmcompile_compile_outputs_equal():
194194
outputs = []
195195

196-
# piecewise compile
196+
# vllmcompile with CUDA graph
197197
vllm_config = VllmConfig(
198198
compilation_config=CompilationConfig(
199-
level=CompilationMode.PIECEWISE,
199+
level=CompilationMode.VLLM_COMPILE,
200200
use_cudagraph=True,
201201
splitting_ops=["silly.attention"],
202202
cudagraph_capture_sizes=[1, 2],
@@ -265,7 +265,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
265265
# piecewise compile without CUDA graph
266266
vllm_config = VllmConfig(
267267
compilation_config=CompilationConfig(
268-
level=CompilationMode.PIECEWISE,
268+
level=CompilationMode.VLLM_COMPILE,
269269
use_cudagraph=False,
270270
splitting_ops=["silly.attention"],
271271
)

tests/compile/piecewise/test_simple.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def _run_simple_model(
6262
):
6363
vllm_config = VllmConfig(
6464
compilation_config=CompilationConfig(
65-
level=CompilationMode.PIECEWISE,
65+
level=CompilationMode.VLLM_COMPILE,
6666
use_cudagraph=True,
6767
use_inductor=use_inductor,
6868
splitting_ops=splitting_ops,

tests/compile/piecewise/test_toy_llama.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ def run_model(
262262
) -> torch.Tensor:
263263
if use_compile:
264264
compilation_config = CompilationConfig(
265-
level=CompilationMode.PIECEWISE,
265+
level=CompilationMode.VLLM_COMPILE,
266266
use_cudagraph=True,
267267
use_inductor=use_inductor,
268268
cudagraph_capture_sizes=[1, 2],
@@ -436,14 +436,14 @@ def benchmark():
436436
for piecewise in [False, True]:
437437
if piecewise:
438438
compilation_config = CompilationConfig(
439-
level=CompilationMode.PIECEWISE,
439+
level=CompilationMode.VLLM_COMPILE,
440440
use_cudagraph=True,
441441
splitting_ops=["silly.attention"],
442442
cudagraph_capture_sizes=cudagraph_sizes,
443443
)
444444
else:
445445
compilation_config = CompilationConfig(
446-
level=CompilationMode.PIECEWISE,
446+
level=CompilationMode.VLLM_COMPILE,
447447
cudagraph_capture_sizes=cudagraph_sizes,
448448
)
449449

tests/compile/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def test_compile_correctness(
124124

125125
for level in [
126126
CompilationMode.NO_COMPILATION,
127-
CompilationMode.PIECEWISE,
127+
CompilationMode.VLLM_COMPILE,
128128
]:
129129
all_args.append(final_args + [f"-O{level}"])
130130
all_envs.append({})

tests/compile/test_decorator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ def run_model(
6666

6767

6868
def test_ignore_torch_compile_decorator():
69-
# piecewise
69+
# vllmcompile
7070
vllm_config = VllmConfig(
7171
compilation_config=CompilationConfig(
72-
level=CompilationMode.PIECEWISE,
72+
level=CompilationMode.VLLM_COMPILE,
7373
use_cudagraph=True,
7474
splitting_ops=["silly.attention"],
7575
cudagraph_capture_sizes=[1, 2],
@@ -184,7 +184,7 @@ def test_conditional_compile_enable_if():
184184
kv_sharing_fast_prefill=True,
185185
),
186186
compilation_config=CompilationConfig(
187-
level=CompilationMode.PIECEWISE,
187+
level=CompilationMode.VLLM_COMPILE,
188188
use_cudagraph=True,
189189
splitting_ops=["silly.attention"],
190190
cudagraph_capture_sizes=[1, 2],
@@ -216,7 +216,7 @@ def test_conditional_compile_enable_if():
216216
kv_sharing_fast_prefill=False,
217217
),
218218
compilation_config=CompilationConfig(
219-
level=CompilationMode.PIECEWISE,
219+
level=CompilationMode.VLLM_COMPILE,
220220
use_cudagraph=True,
221221
splitting_ops=["silly.attention"],
222222
cudagraph_capture_sizes=[1, 2],

tests/compile/test_full_graph.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
8383

8484
@pytest.mark.parametrize(
8585
"optimization_level",
86-
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.PIECEWISE],
86+
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
8787
)
8888
@pytest.mark.parametrize("model_info", models_list(all=True))
8989
@create_new_process_for_each_test()
@@ -106,7 +106,7 @@ def test_full_graph(
106106
[
107107
# additional compile sizes, only some of the models
108108
(
109-
CompilationConfig(level=CompilationMode.PIECEWISE, compile_sizes=[1, 2]),
109+
CompilationConfig(level=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
110110
model,
111111
)
112112
for model in models_list(all=False)
@@ -115,7 +115,7 @@ def test_full_graph(
115115
# RMSNorm + quant fusion, only 8-bit quant models
116116
(
117117
CompilationConfig(
118-
level=CompilationMode.PIECEWISE,
118+
level=CompilationMode.VLLM_COMPILE,
119119
custom_ops=["+rms_norm"],
120120
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
121121
),
@@ -127,7 +127,8 @@ def test_full_graph(
127127
# Test depyf integration works
128128
(
129129
CompilationConfig(
130-
level=CompilationMode.PIECEWISE, debug_dump_path=tempfile.gettempdir()
130+
level=CompilationMode.VLLM_COMPILE,
131+
debug_dump_path=tempfile.gettempdir(),
131132
),
132133
("facebook/opt-125m", {}),
133134
),
@@ -136,7 +137,7 @@ def test_full_graph(
136137
# graph inductor partition
137138
(
138139
CompilationConfig(
139-
level=CompilationMode.PIECEWISE,
140+
level=CompilationMode.VLLM_COMPILE,
140141
# inductor graph partition uses
141142
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
142143
use_inductor_graph_partition=True,
@@ -167,7 +168,7 @@ def test_custom_compile_config(
167168

168169
@pytest.mark.parametrize(
169170
"optimization_level",
170-
[CompilationMode.NO_COMPILATION, CompilationMode.PIECEWISE],
171+
[CompilationMode.NO_COMPILATION, CompilationMode.VLLM_COMPILE],
171172
)
172173
def test_fp8_kv_scale_compile(optimization_level: int):
173174
model = "Qwen/Qwen2-0.5B"
@@ -186,7 +187,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
186187

187188
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
188189
compilation_config = CompilationConfig(
189-
level=CompilationMode.PIECEWISE,
190+
level=CompilationMode.VLLM_COMPILE,
190191
use_inductor_graph_partition=True,
191192
cudagraph_mode=CUDAGraphMode.PIECEWISE,
192193
custom_ops=["+quant_fp8"],

tests/compile/test_fusion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
114114

115115
vllm_config = VllmConfig(
116116
compilation_config=CompilationConfig(
117-
level=CompilationMode.PIECEWISE,
117+
level=CompilationMode.VLLM_COMPILE,
118118
custom_ops=["+rms_norm", "+quant_fp8"],
119119
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
120120
)

tests/compile/test_fusion_all_reduce.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
219219

220220
vllm_config = VllmConfig(
221221
compilation_config=CompilationConfig(
222-
level=CompilationMode.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
222+
level=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
223223
)
224224
)
225225
vllm_config.compilation_config.pass_config = PassConfig(

tests/compile/test_fusion_attn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ def test_attention_quant_pattern(
431431
),
432432
scheduler_config=SchedulerConfig(max_num_seqs=1024),
433433
compilation_config=CompilationConfig(
434-
level=CompilationMode.PIECEWISE,
434+
level=CompilationMode.VLLM_COMPILE,
435435
custom_ops=["+quant_fp8"],
436436
use_inductor_graph_partition=use_inductor_graph_partition,
437437
),

0 commit comments

Comments
 (0)