Add e2e fusions to fullgraph test (should work with Triton backend), disable without flashinfer

ProExpertProg · ProExpertProg · commit 31d0127c71e7 · 2025-10-11T23:32:47.000-04:00
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -416,15 +416,16 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
@@ -33,7 +33,7 @@
         )
     ]
 
-    if current_platform.is_device_capability((10, 0)):
+    if current_platform.is_device_capability((10, 0)) and has_flashinfer():
         MODELS_FP8 += [
             (
                 "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
@@ -97,7 +97,6 @@ def test_attn_quant(
 
     # Disable, compile cache to make sure custom passes run.
     # Otherwise, we can't verify fusion happened through the logs.
-    # Log capture also doesn't work with multiprocessing yet.
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
 
     # To capture subprocess logs, we need to know whether spawn or fork is used.
@@ -170,7 +169,6 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
 
     # Disable, compile cache to make sure custom passes run.
     # Otherwise, we can't verify fusion happened through the logs.
-    # Log capture also doesn't work with multiprocessing yet.
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
 
     # To capture subprocess logs, we need to know whether spawn or fork is used.

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`)`
`34`	`34`	`]`
`35`	`35`
`36`		`- if current_platform.is_device_capability((10, 0)):`
	`36`	`+ if current_platform.is_device_capability((10, 0)) and has_flashinfer():`
`37`	`37`	`MODELS_FP8 += [`
`38`	`38`	`(`
`39`	`39`	`"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",`
`@@ -97,7 +97,6 @@ def test_attn_quant(`
`97`	`97`
`98`	`98`	`# Disable, compile cache to make sure custom passes run.`
`99`	`99`	`# Otherwise, we can't verify fusion happened through the logs.`
`100`		`- # Log capture also doesn't work with multiprocessing yet.`
`101`	`100`	`monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")`
`102`	`101`
`103`	`102`	`# To capture subprocess logs, we need to know whether spawn or fork is used.`
`@@ -170,7 +169,6 @@ def test_tp2_attn_quant_allreduce_rmsnorm(`
`170`	`169`
`171`	`170`	`# Disable, compile cache to make sure custom passes run.`
`172`	`171`	`# Otherwise, we can't verify fusion happened through the logs.`
`173`		`- # Log capture also doesn't work with multiprocessing yet.`
`174`	`172`	`monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")`
`175`	`173`
`176`	`174`	`# To capture subprocess logs, we need to know whether spawn or fork is used.`