 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.utils import _is_torch_equal_or_newer
+from vllm.config.compilation import CompilationLevel
+from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
 
 
 def test_version():
+    # Test the version comparison logic using the private function
     assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev")
     assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev")
     assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev")
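+    # Illustrative, not part of the original test: a release below the
+    # floor should compare as older, e.g.
+    #   assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")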
@@ -17,6 +19,9 @@ def test_version():
 
 def test_use_cudagraphs_dynamic():
     vllm_config = VllmConfig()
+    # The default configuration still reports use_cudagraph=True; the engine
+    # decides at runtime when graphs are actually captured instead of
+    # capturing unconditionally.
     assert vllm_config.compilation_config.use_cudagraph
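+    # A config can still opt out explicitly (illustrative):
+    #   VllmConfig(compilation_config=CompilationConfig(
+    #       cudagraph_mode=CUDAGraphMode.NONE))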
 
 
@@ -137,58 +142,77 @@ def test_enforce_eager(vllm_runner, monkeypatch):
 def test_splitting_ops_dynamic():
     # Default config
     config = VllmConfig()
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
-    assert config.compilation_config.splitting_ops_contain_attention()
+    # The default V1 config leaves cudagraph mode unset; splitting ops are
+    # only populated when the engine decides to use piecewise compilation.
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+    assert not config.compilation_config.splitting_ops_contain_attention()
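+    # Piecewise splitting is opted into explicitly below via
+    # CompilationConfig(level=CompilationLevel.PIECEWISE, ...).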
 
     # When use_inductor_graph_partition=True
-    if _is_torch_equal_or_newer("2.9.0.dev"):
-        # inductor graph partition is only available in PyTorch 2.9+.
-        # this is a fast config check so we are not using pytest.skip.
+    if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                use_inductor_graph_partition=True, splitting_ops=["silly_attention"]
+                level=CompilationLevel.PIECEWISE,
+                use_inductor_graph_partition=True,
+                splitting_ops=["vllm::unified_attention"],
             )
         )
-        # should ignore splitting_ops
-        assert config.compilation_config.splitting_ops == []
+        # with inductor partition we use splitting_ops directly for
+        # partition rules
+        assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]
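+        # (The split then happens inside Inductor at these ops rather than
+        # by splitting the fx graph itself.)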
 
-    # When attn_fusion pass enabled.
+    # When the attn_fusion pass is enabled, splitting_ops now default to
+    # the attention ops.
     config = VllmConfig(
         compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
             pass_config={"enable_attn_fusion": True, "enable_noop": True},
             custom_ops=["+quant_fp8"],
             cudagraph_mode=CUDAGraphMode.PIECEWISE,
         )
     )
-    assert config.compilation_config.splitting_ops == []
-    # cudagraph mode also falls back to FULL
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
-
-    # splitting_ops cannot contain attention ops when the attn_fusion
-    # pass is enabled.
-    with pytest.raises(AssertionError):
-        config = VllmConfig(
-            compilation_config=CompilationConfig(
-                pass_config={"enable_attn_fusion": True, "enable_noop": True},
-                custom_ops=["+quant_fp8"],
-                cudagraph_mode=CUDAGraphMode.PIECEWISE,
-                # workaround for accessing all attention ops
-                splitting_ops=CompilationConfig()._attention_ops,
-            )
-        )
+    # With the new simplified logic, attention fusion works with splitting_ops
+    assert config.compilation_config.splitting_ops_contain_attention()
+    # cudagraph mode remains PIECEWISE
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 
     # When both use_inductor_graph_partition and the attn_fusion pass are enabled.
-    if _is_torch_equal_or_newer("2.9.0.dev"):
+    if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
                 use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
             )
         )
-        assert config.compilation_config.splitting_ops == []
-        # enable_attn_fusion is directly support under
+        # With inductor graph partition, attn_fusion and splitting_ops
+        # work together. Default splitting_ops include attention ops.
+        assert config.compilation_config.splitting_ops_contain_attention()
+        # enable_attn_fusion is directly supported under
         # use_inductor_graph_partition=True, and cudagraph_mode
         # is unchanged.
         assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+
+
+def test_resolve_operator_overload():
+    import torch
+
+    from vllm.compilation.partition_rules import resolve_defined_ops
+
+    # Test valid operator names
+    resolved = resolve_defined_ops(["aten::mm.default", "aten::addmm.default"])
+    assert len(resolved) == 2
+    assert resolved[0] is torch.ops.aten.mm.default
+    assert resolved[1] is torch.ops.aten.addmm.default
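+    # Names follow the "namespace::op_name.overload_name" convention and
+    # resolve to torch OpOverload objects (e.g. torch.ops.aten.mm.default).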
+
+    # Test that invalid operators are skipped (not raising exceptions)
+    resolved = resolve_defined_ops(
+        [
+            "aten::mm.default",
+            "aten::nonexistent_op.default",  # This should be skipped
+            "aten::addmm.default",
+        ]
+    )
+    assert len(resolved) == 2  # Only 2 valid ops
+    assert resolved[0] is torch.ops.aten.mm.default
+    assert resolved[1] is torch.ops.aten.addmm.default
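+    # Skipping unknown names keeps partition rules usable on builds where an
+    # op is not registered (illustrative rationale, not asserted here).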