
Commit e3fdb62

Authored by morrison-turnansky, ProExpertProg, and ZJY0516
[FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
1 parent 7200a21 commit e3fdb62
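
In short, this commit deprecates `CompilationConfig.use_inductor` in favor of the string-valued `backend` field. A minimal migration sketch (the import path and constructor usage follow current vLLM conventions and are assumed here, not taken from the diff below):

```python
# Hedged sketch of the user-facing migration; not part of this commit's diff.
from vllm.config import CompilationConfig, CompilationLevel

# Before: boolean flag (now deprecated; it warns and is mapped onto `backend`).
old_cfg = CompilationConfig(level=CompilationLevel.PIECEWISE, use_inductor=True)

# After: name the compile backend explicitly. "" keeps the platform default
# (e.g. "inductor" on CUDA-alike platforms, per the docstring change below).
new_cfg = CompilationConfig(level=CompilationLevel.PIECEWISE, backend="inductor")
```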

File tree

8 files changed: +153 additions, -86 deletions


tests/compile/piecewise/test_toy_llama.py

Lines changed: 11 additions & 20 deletions
@@ -258,13 +258,13 @@ def tractable_computation(
 
 @torch.inference_mode
 def run_model(
-    llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False
+    llama_config, use_compile: bool, backend: str, split_attn: bool = False
 ) -> torch.Tensor:
     if use_compile:
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            use_inductor=use_inductor,
+            backend=backend,
             cudagraph_capture_sizes=[1, 2],
         )
         if split_attn:
@@ -338,8 +338,8 @@ def run_model(
     return output.cpu()
 
 
-@pytest.mark.parametrize("use_inductor", [True, False])
-def test_toy_llama(use_inductor: bool):
+@pytest.mark.parametrize("backend", ["inductor", "eager"])
+def test_toy_llama(backend: str):
     # compare output with and without piecewise compilation
 
     llama_config = LlamaConfig(
@@ -358,10 +358,10 @@ def test_toy_llama(use_inductor: bool):
         num_backend_compilations=0,
         num_cudagraph_captured=0,
     ):
-        outputs.append(run_model(llama_config, use_inductor=False, use_compile=False))
-        run_model(tractable_config, use_inductor=False, use_compile=False)
+        outputs.append(run_model(llama_config, backend="eager", use_compile=False))
+        run_model(tractable_config, backend="eager", use_compile=False)
 
-    if use_inductor:
+    if backend == "inductor":
         kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
     else:
         kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
@@ -377,10 +377,8 @@ def test_toy_llama(use_inductor: bool):
         num_cudagraph_captured=2,
         **kwargs,
     ):
-        outputs.append(
-            run_model(llama_config, use_inductor=use_inductor, use_compile=True)
-        )
-        run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
+        outputs.append(run_model(llama_config, backend=backend, use_compile=True))
+        run_model(tractable_config, backend=backend, use_compile=True)
 
     with compilation_counter.expect(
         num_graphs_seen=1,  # one graph for the model
@@ -395,16 +393,9 @@ def test_toy_llama(use_inductor: bool):
         ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
         outputs.append(
-            run_model(
-                llama_config,
-                use_inductor=use_inductor,
-                use_compile=True,
-                split_attn=True,
-            )
+            run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
         )
-        run_model(
-            tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True
-        )
+        run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
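
For reference, the `backend` string now also determines which compile counters the test expects; a standalone restatement of that mapping (the helper name is illustrative, not from the test file):

```python
# Sketch only: mirrors the kwargs selection in test_toy_llama above.
def expected_compile_counters(backend: str) -> dict[str, int]:
    if backend == "inductor":
        return {"num_inductor_compiles": 1, "num_eager_compiles": 0}
    return {"num_eager_compiles": 1, "num_inductor_compiles": 0}

assert expected_compile_counters("eager")["num_inductor_compiles"] == 0
assert expected_compile_counters("inductor")["num_inductor_compiles"] == 1
```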

tests/compile/test_basic_correctness.py

Lines changed: 30 additions & 24 deletions
@@ -77,14 +77,15 @@ class TestSetting:
             method="encode",
         ),
         # vision language model
-        TestSetting(
-            model="microsoft/Phi-3.5-vision-instruct",
-            model_args=["--trust-remote-code", "--max-model-len", "2048"],
-            pp_size=2,
-            tp_size=1,
-            attn_backend="FLASH_ATTN",
-            method="generate_with_image",
-        ),
+        # See https://github.com/vllm-project/vllm/issues/26716.
+        # TestSetting(
+        #     model="microsoft/Phi-3.5-vision-instruct",
+        #     model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        #     pp_size=2,
+        #     tp_size=1,
+        #     attn_backend="FLASH_ATTN",
+        #     method="generate_with_image",
+        # ),
     ],
 )
 def test_compile_correctness(
@@ -109,41 +110,46 @@ def test_compile_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
         final_args = [
-            "--enforce-eager",
             *model_args,
             "-pp",
             str(pp_size),
             "-tp",
             str(tp_size),
+            "-O.cudagraph_mode=none",
         ]
 
         all_args: list[list[str]] = []
         all_envs: list[dict[str, str] | None] = []
 
-        for level in [
-            CompilationLevel.NO_COMPILATION,
+        for comp_level in [
+            CompilationLevel.DYNAMO_AS_IS,
+            CompilationLevel.DYNAMO_ONCE,
             CompilationLevel.PIECEWISE,
         ]:
-            all_args.append(final_args + [f"-O{level}"])
-            all_envs.append({})
+            for level in [CompilationLevel.NO_COMPILATION, comp_level]:
+                all_args.append(
+                    final_args + [f"-O.level={level}", "-O.backend=inductor"]
+                )
 
-        # inductor will change the output, so we only compare if the output
-        # is close, not exactly the same.
-        compare_all_settings(
-            model,
-            all_args,
-            all_envs,
-            method=method if method != "generate" else "generate_close",
-        )
-        all_envs.clear()
-        all_args.clear()
+            # inductor will change the output, so we only compare if the output
+            # is close, not exactly the same.
+            compare_all_settings(
+                model,
+                all_args,
+                all_envs,
+                method=method if method != "generate" else "generate_close",
+            )
+            all_envs.clear()
+            all_args.clear()
 
         for level in [
             CompilationLevel.NO_COMPILATION,
             CompilationLevel.DYNAMO_AS_IS,
             CompilationLevel.DYNAMO_ONCE,
+            CompilationLevel.PIECEWISE,
         ]:
-            all_args.append(final_args + [f"-O{level}"])
+            all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+            all_envs.append({})
             all_envs.append({})
 
         compare_all_settings(model, all_args * 3, all_envs, method=method)
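
The test now passes compilation options as dotted `-O.<field>=<value>` flags instead of the combined `-O{level}` form; a self-contained sketch of how those argument lists are built (the helper is illustrative, while the flag spellings come from the diff above):

```python
# Sketch only: builds the per-option flags used by test_compile_correctness.
def compile_args(level: int, backend: str) -> list[str]:
    # Replaces the old single flag f"-O{level}"; cudagraphs are disabled
    # separately in this test via "-O.cudagraph_mode=none".
    return [f"-O.level={level}", f"-O.backend={backend}"]

# Level 3 corresponds to CompilationLevel.PIECEWISE in vllm/config/compilation.py.
assert compile_args(3, "eager") == ["-O.level=3", "-O.backend=eager"]
```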

tests/model_executor/test_enabled_custom_ops.py

Lines changed: 20 additions & 19 deletions
@@ -36,55 +36,56 @@ class Relu3(ReLUSquaredActivation):
 
 
 @pytest.mark.parametrize(
-    "env, torch_level, use_inductor, ops_enabled, default_on",
+    "env, torch_level, backend, ops_enabled, default_on",
     [
         # Default values based on compile level
         # - All by default (no Inductor compilation)
-        (None, 0, False, [True] * 4, True),
-        (None, 1, True, [True] * 4, True),
-        (None, 2, False, [True] * 4, True),
+        (None, 0, "eager", [True] * 4, True),
+        (None, 1, "eager", [True] * 4, True),
+        (None, 2, "eager", [True] * 4, True),
+        (None, 3, "eager", [True] * 4, True),
         # - None by default (with Inductor)
-        (None, 3, True, [False] * 4, False),
-        (None, 4, True, [False] * 4, False),
-        # - All by default (without Inductor)
-        (None, 3, False, [True] * 4, True),
-        (None, 4, False, [True] * 4, True),
+        (None, 0, "inductor", [True] * 4, True),
+        # - None by default (with Inductor)
+        (None, 1, "inductor", [False] * 4, False),
+        (None, 2, "inductor", [False] * 4, False),
+        (None, 3, "inductor", [False] * 4, False),
         # Explicitly enabling/disabling
         #
         # Default: all
         #
         # All but SiluAndMul
-        ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
+        ("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True),
         # Only ReLU3
-        ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
+        ("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False),
         # All but SiluAndMul
-        ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
+        ("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True),
         # All but ReLU3 (even if ReLU2 is on)
-        ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True),
+        ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
         # RMSNorm and SiluAndMul
-        ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
+        ("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False),
         # All but RMSNorm
-        ("-rms_norm", 3, False, [0, 1, 1, 1], True),
+        ("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
         #
         # Default: none
         #
         # Only ReLU3
-        ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
+        ("none,+relu3", 3, "inductor", [0, 0, 0, 1], False),
         # All but RMSNorm
-        ("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
+        ("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True),
     ],
 )
 def test_enabled_ops(
     env: str | None,
     torch_level: int,
-    use_inductor: bool,
+    backend: str,
     ops_enabled: list[int],
     default_on: bool,
 ):
     custom_ops = env.split(",") if env else []
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops
+            backend=backend, level=torch_level, custom_ops=custom_ops
         )
     )
     with set_current_vllm_config(vllm_config):

vllm/compilation/backends.py

Lines changed: 5 additions & 1 deletion
@@ -41,7 +41,7 @@
 
 
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
-    if compilation_config.use_inductor:
+    if compilation_config.backend == "inductor":
         # Use standalone compile only if requested, version is new enough,
         # and the symbol actually exists in this PyTorch build.
         if (
@@ -55,6 +55,10 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
             logger.debug("Using InductorAdaptor")
             return InductorAdaptor()
     else:
+        assert compilation_config.backend == "eager", (
+            "Custom backends not supported with CompilationLevel.PIECEWISE"
+        )
+
         logger.debug("Using EagerAdaptor")
         return EagerAdaptor()
6064

vllm/config/compilation.py

Lines changed: 63 additions & 10 deletions
@@ -15,6 +15,7 @@
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
 
 if TYPE_CHECKING:
@@ -187,7 +188,8 @@ class CompilationConfig:
     backend: str = ""
     """The backend for compilation. It needs to be a string:
 
-    - "" (empty string): use the default backend.
+    - "" (empty string): use the default backend ("inductor" on CUDA-alike
+      platforms).
     - "eager"/"openxla"/...: use the specified backend registered in PyTorch.
     - "full.module.name": a qualified name which can be used to import the
@@ -196,7 +198,12 @@ class CompilationConfig:
     distributed setting. When the compilation level is 1 or 2, the backend is
     used for the compilation directly (it sees the whole graph). When the
     compilation level is 3, the backend is used for the piecewise compilation
-    (it sees a part of the graph)."""
+    (it sees a part of the graph). The backend can not be custom for compilation
+    level 3, i.e. the backend must be either eager or inductor. Furthermore,
+    compilation is only piecewise if splitting ops is set accordingly and
+    use_inductor_cudagraphs_partition is off. Note that the default options for
+    splitting ops are sufficient for piecewise compilation.
+    """
     custom_ops: list[str] = field(default_factory=list)
     """Fine-grained control over which custom ops to enable/disable. Use 'all'
     to enable all, 'none' to disable all. Also specify a list of custom op
@@ -229,16 +236,24 @@ class CompilationConfig:
     If empty list [], no ops are excluded (suitable for full cudagraphs)."""
 
     # Inductor capture
-    use_inductor: bool = True
-    """Whether to use inductor compilation:
+    use_inductor: bool | None = None
+    """
+    Whether to use inductor compilation.
+
+    This flag is deprecated and will be removed in the next release 0.12.0.
+    Please use the 'backend' option instead.
 
     - False: inductor compilation is not used. graph runs in eager
       (custom_ops enabled by default).
     - True: inductor compilation is used (custom_ops disabled by default).
       One graph for symbolic shape and one graph per size in compile_sizes
       are compiled using configurations in inductor_compile_config.
 
-    This setting is ignored if level<PIECEWISE."""
+    This setting is ignored if level<PIECEWISE.
+
+    For future compatibility:
+    If use_inductor is True, backend="inductor" otherwise backend="eager".
+    """
     compile_sizes: list[int | str] | None = None
     """Sizes to compile for inductor. In addition
     to integers, it also supports "cudagraph_capture_sizes" to
@@ -545,23 +560,59 @@ def __post_init__(self) -> None:
                 "(where 'op' is the registered op name)"
             )
 
+        # Currently only eager and inductor backend are supported.
+        # for piecewise compilation. Custom backends are not suppported for
+        # piecewise compilation. Update when more backends are supported.
+        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+            "",
+            "eager",
+            "inductor",
+        ]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
+
+        if self.use_inductor is not None:
+            logger.warning_once(
+                "The 'use_inductor' flag is deprecated and will be "
+                "removed in the next release (v0.12.0). "
+                "Please use the 'backend' option instead.",
+            )
+            self.backend = "inductor" if self.use_inductor else "eager"
+
+        if self.backend == "":
+            self.backend = current_platform.simple_compile_backend
+
     def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
+        """
+        Initialize the backend for the compilation config from a vllm config.
+        Arguments:
+            vllm_config: The vllm config to initialize the backend from.
+        Returns:
+            The backend for the compilation config.
+        """
+        if self.level is None:
+            raise ValueError(
+                "No compilation level is set. This method should only be \
+                called via vllm config where the level is set if none is \
+                provided."
+            )
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
 
         from torch._dynamo.backends.registry import list_backends
 
         torch_backends = list_backends(exclude_tags=tuple())
         if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
-            if self.backend == "":
-                return "eager"
             if self.backend in torch_backends:
                 return self.backend
             return resolve_obj_by_qualname(self.backend)
 
-        # TODO: pass user-specified backend to piecewise compilation
-        # merge with the config use_inductor
         assert self.level == CompilationLevel.PIECEWISE
+        if self.backend not in ["eager", "inductor"]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
 
         from vllm.compilation.backends import VllmBackend
 
@@ -710,7 +761,9 @@ def is_attention_compiled_piecewise(self) -> bool:
             return self.level == CompilationLevel.PIECEWISE
 
         # Inductor partition case
-        return self.level > CompilationLevel.NO_COMPILATION and self.use_inductor
+        return (
+            self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
+        )
 
     def custom_op_log_check(self):
         """
