
Commit 96b9aa5

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): rename compilation level to compilation mode, deprecate compilation level (#26355)

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
1 parent e66d787 commit 96b9aa5
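
The hunks below apply the rename uniformly: the `level` field of `CompilationConfig` becomes `mode`, and the `CompilationLevel` enum becomes `CompilationMode` with renamed members. As a quick orientation, here is a minimal usage sketch assembled from the renames visible in this diff (the model name is simply the one used in the docs examples):

```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode

# Renames visible in the hunks below (old CompilationLevel -> new CompilationMode):
#   NO_COMPILATION -> NONE
#   DYNAMO_AS_IS   -> STOCK_TORCH_COMPILE
#   DYNAMO_ONCE    -> DYNAMO_TRACE_ONCE
#   PIECEWISE      -> VLLM_COMPILE
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,  # replaces the deprecated `level=` keyword
    ),
)
```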

42 files changed: 271 additions & 249 deletions

docs/configuration/conserving_memory.md

Lines changed: 2 additions & 2 deletions

@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inference
 
 ```python
 from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode
 
 llm = LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        mode=CompilationMode.VLLM_COMPILE,
         # By default, it goes up to max_num_seqs
         cudagraph_capture_sizes=[1, 2, 4, 8, 16],
     ),

docs/design/cuda_graphs.md

Lines changed: 2 additions & 2 deletions

@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
     """NO CUDA Graphs support"""
 ```
 
-Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
 
 The following table lists backends that support full CUDA Graphs at the time of writing.
 
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
 import vllm
 from vllm.config import CUDAGraphMode
 
-compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     dtype="auto",

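For readers following the dict-form snippet above, the same setting can also be written with the config objects this commit renames. This is a minimal sketch, assuming the object form accepts the same fields shown elsewhere in this diff (`mode`, `cudagraph_mode`); `"mode": 3` in the dict corresponds to `CompilationMode.VLLM_COMPILE`:

```python
import vllm
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

# Object-form equivalent of {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}.
compilation_config = CompilationConfig(
    mode=CompilationMode.VLLM_COMPILE,
    cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE,
)
model = vllm.LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype="auto",
    compilation_config=compilation_config,
)
```
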
examples/offline_inference/data_parallel.py

Lines changed: 1 addition & 1 deletion

@@ -95,7 +95,7 @@ def parse_args():
     parser.add_argument(
         "--compilation-config",
         type=int,
-        help=("Compilation optimization (O) level 0-3."),
+        help=("Compilation optimization (O) mode 0-3."),
     )
     parser.add_argument(
         "--quantization",

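The hunk above only renames the help text; the integer the flag parses (0-3) still selects the -O optimization mode. A hypothetical follow-through, assuming `LLM` accepts the integer form of `compilation_config` the way the `-O` shorthand suggests, with 3 corresponding to `CompilationMode.VLLM_COMPILE`:

```python
from vllm import LLM

# Hypothetical: forward the parsed --compilation-config integer straight to the engine.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    compilation_config=3,  # -O3, i.e. CompilationMode.VLLM_COMPILE
)
```
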
tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 5 additions & 5 deletions

@@ -14,7 +14,7 @@
 from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
 
     outputs = []
 
-    # piecewise compile
+    # vllm compile
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # no compile or cudagraph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
+            mode=CompilationMode.NONE,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.NONE
@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # piecewise compile without CUDA graph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=False,
             splitting_ops=["silly::attention"],
             use_inductor_graph_partition=use_inductor_graph_partition,

tests/compile/piecewise/test_simple.py

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -61,7 +61,7 @@ def _run_simple_model(
 ):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             use_inductor=use_inductor,
             splitting_ops=splitting_ops,

tests/compile/piecewise/test_toy_llama.py

Lines changed: 5 additions & 5 deletions

@@ -21,7 +21,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -356,13 +356,13 @@ def test_toy_llama(
     )
 
     compile_config_no_compile = CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION,
+        level=CompilationMode.NONE,
         cudagraph_mode=CUDAGraphMode.NONE,
         backend="eager",
     )
 
     compile_config_no_split = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        level=CompilationMode.VLLM_COMPILE,
         use_inductor_graph_partition=use_inductor_graph_partition,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
         backend=backend,
@@ -458,14 +458,14 @@ def benchmark():
     for piecewise in [False, True]:
         if piecewise:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 use_cudagraph=True,
                 splitting_ops=["silly::attention"],
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 cudagraph_capture_sizes=cudagraph_sizes,
             )

tests/compile/test_aot_compile.py

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -38,7 +38,7 @@ def forward(self, x: torch.Tensor):
 def make_vllm_config() -> VllmConfig:
     return VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
         )
     )

tests/compile/test_async_tp.py

Lines changed: 2 additions & 1 deletion

@@ -10,6 +10,7 @@
 from vllm.compilation.collective_fusion import AsyncTPPass
 from vllm.config import (
     CompilationConfig,
+    CompilationMode,
     DeviceConfig,
     ModelConfig,
     PassConfig,
@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
         common_args.append("--enforce-eager")
 
     compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
         "compile_sizes": [2, 4, 8],
        "splitting_ops": [],
        "pass_config": {"enable_async_tp": async_tp_enabled},

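The hunk above moves the dict-form config from the numeric "level" key to a "mode" key holding the enum. As a usage sketch (assuming, as in the docs hunk earlier in this commit, that a plain dict passed as `compilation_config` is parsed into a `CompilationConfig`; `enable_async_tp` is set to True purely for illustration):

```python
from vllm import LLM
from vllm.config import CompilationMode

# Dict-form config mirroring the test above.
compilation_config = {
    "mode": CompilationMode.VLLM_COMPILE,
    "compile_sizes": [2, 4, 8],
    "splitting_ops": [],
    "pass_config": {"enable_async_tp": True},
}

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    compilation_config=compilation_config,
)
```
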
tests/compile/test_basic_correctness.py

Lines changed: 14 additions & 16 deletions

@@ -4,7 +4,7 @@
 
 import pytest
 
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 from vllm.utils import cuda_device_count_stateless
 
 from ..utils import compare_all_settings
@@ -21,7 +21,7 @@ class TestSetting:
 
 
 # we cannot afford testing the full Cartesian product
-# of all models and all levels
+# of all models and all modes
 @pytest.mark.parametrize(
     "test_setting",
     [
@@ -121,15 +121,13 @@ def test_compile_correctness(
     all_args: list[list[str]] = []
     all_envs: list[dict[str, str] | None] = []
 
-    for comp_level in [
-        CompilationLevel.DYNAMO_AS_IS,
-        CompilationLevel.DYNAMO_ONCE,
-        CompilationLevel.PIECEWISE,
+    for comp_mode in [
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
     ]:
-        for level in [CompilationLevel.NO_COMPILATION, comp_level]:
-            all_args.append(
-                final_args + [f"-O.level={level}", "-O.backend=inductor"]
-            )
+        for mode in [CompilationMode.NONE, comp_mode]:
+            all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
 
     # inductor will change the output, so we only compare if the output
     # is close, not exactly the same.
@@ -142,13 +140,13 @@ def test_compile_correctness(
     all_envs.clear()
     all_args.clear()
 
-    for level in [
-        CompilationLevel.NO_COMPILATION,
-        CompilationLevel.DYNAMO_AS_IS,
-        CompilationLevel.DYNAMO_ONCE,
-        CompilationLevel.PIECEWISE,
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
     ]:
-        all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+        all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
         all_envs.append({})
         all_envs.append({})

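These hunks also switch the dotted CLI override from `-O.level=` to `-O.mode=`. A small sketch of one iteration of the loops above, with placeholder base arguments (hypothetical values; `final_args` is not shown in this diff):

```python
from vllm.config import CompilationMode

# One hypothetical iteration of the loop above: build the override arguments
# that select the compilation mode and the inductor backend.
final_args = ["--model", "meta-llama/Llama-3.1-8B-Instruct"]  # placeholder base args
mode = CompilationMode.VLLM_COMPILE
args = final_args + [f"-O.mode={mode}", "-O.backend=inductor"]
print(args)
```
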
tests/compile/test_config.py

Lines changed: 10 additions & 10 deletions

@@ -4,7 +4,7 @@
 
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.compilation import CompilationLevel
+from vllm.config.compilation import CompilationMode
 from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
 
 
@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-def test_dynamo_as_is(vllm_runner, monkeypatch):
+def test_stock_torch_compile(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(dynamo_as_is_count=1),
+        compilation_counter.expect(stock_torch_compile_count=1),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 1},
+            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 0},
+            compilation_config={"mode": CompilationMode.NONE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 splitting_ops=["vllm::unified_attention"],
             )
@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
     # When attn_fusion pass enabled, splitting_ops now default to attention ops.
     config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
