 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
-from vllm.utils import is_torch_equal_or_newer
 from vllm.utils.flashinfer import has_flashinfer
 from vllm.v1.kv_cache_interface import AttentionSpec
 
@@ -290,7 +289,6 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
     # quant_fp4 only has the custom impl
     + list(flat_product(BACKENDS_FP4, MODELS_FP4, [""])),
 )
-@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
 @pytest.mark.skipif(
     not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
 )
@@ -305,7 +303,6 @@ def test_attention_quant_pattern(
     model_name: str,
     model_class: type[AttentionQuantPatternModel],
     backend: _Backend,
-    use_inductor_graph_partition: bool,
     dist_init,
 ):
     """Test AttentionStaticQuantPattern fusion pass"""
@@ -314,10 +311,6 @@ def test_attention_quant_pattern(
     ):
         pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
 
-    # TODO(boyuan/luka): test inductor graph partition on rocm
-    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
     custom_ops_list = custom_ops.split(",") if custom_ops else []
 
     device = torch.device("cuda:0")
@@ -333,7 +326,6 @@ def test_attention_quant_pattern(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops_list,
-            use_inductor_graph_partition=use_inductor_graph_partition,
         ),
         cache_config=CacheConfig(cache_dtype="fp8"),
     )
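
For context, the use_inductor_graph_partition handling removed above followed a common pytest pattern: parametrize the flag, then skip the partitioned variant when the installed torch is too old. A minimal standalone sketch of that pattern is shown below; the test name and body are illustrative only and not part of the vLLM test suite.

    import pytest

    from vllm.utils import is_torch_equal_or_newer


    @pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
    def test_example_with_partition_flag(use_inductor_graph_partition: bool):
        # Inductor graph partition relies on torch.compile support that is only
        # available from torch 2.9 onward, so skip the partitioned variant on
        # older builds (same gate the diff above removes).
        if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
            pytest.skip("Inductor graph partition requires torch>=2.9")
        ...  # build the config with the flag and run the actual test body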