@@ -418,15 +418,16 @@ steps:
418418 - pytest -v -s compile/test_basic_correctness.py
419419 - pytest -v -s compile/piecewise/
420420
421- - label: PyTorch Fullgraph Test # 20min
422- timeout_in_minutes: 30
421+ - label: PyTorch Fullgraph Test # 22min
422+ timeout_in_minutes: 35
423423 mirror_hardwares: [amdexperimental]
424424 torch_nightly: true
425425 source_file_dependencies:
426426 - vllm/
427427 - tests/compile
428428 commands:
429429 - pytest -v -s compile/test_full_graph.py
430+ - pytest -v -s compile/test_fusions_e2e.py
430431
431432 - label: Kernels Core Operation Test # 48min
432433 timeout_in_minutes: 75
@@ -808,8 +809,8 @@ steps:
808809 # Whisper needs spawn method to avoid deadlock
809810 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
810811
811- - label: Blackwell Test # 38 min
812- timeout_in_minutes: 60
812+ - label: Blackwell Test # TODO min
813+ timeout_in_minutes: 70
813814 working_dir: "/vllm-workspace/"
814815 gpu: b200
815816 # optional: true
@@ -822,8 +823,6 @@ steps:
822823 - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
823824 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
824825 - vllm/v1/attention/backends/flashinfer.py
825- - vllm/compilation/fusion.py
826- - vllm/compilation/fusion_attn.py
827826 commands :
828827 - nvidia-smi
829828 - python3 examples/offline_inference/basic/chat.py
@@ -840,15 +839,32 @@ steps:
840839 - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
841840 - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
842841 - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
842+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
843+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
843844 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
844845 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
845- # Fusion
846- - pytest -v -s tests/compile/test_fusion_all_reduce.py
847- - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
848846 - pytest -v -s tests/kernels/moe/test_flashinfer.py
847+
848+ - label: Blackwell Fusion Tests # TODO min
849+ timeout_in_minutes: 70
850+ working_dir: "/vllm-workspace/"
851+ gpu: b200
852+ source_file_dependencies:
853+ - csrc/quantization/fp4/
854+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
855+ - vllm/v1/attention/backends/flashinfer.py
856+ - vllm/compilation/
857+ # can affect pattern matching
858+ - vllm/model_executor/layers/layernorm.py
859+ - vllm/model_executor/layers/activation.py
860+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
861+ commands:
862+ - nvidia-smi
863+ - pytest -v -s tests/compile/test_fusion_attn.py
849864 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
850- - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
851- - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
865+ # this runner has 2 GPUs available even though num_gpus=2 is not set
866+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
867+ - pytest -v -s tests/compile/test_fusions_e2e.py
852868
853869 - label: Blackwell GPT-OSS Eval
854870 timeout_in_minutes: 60
@@ -1103,14 +1119,16 @@ steps:
11031119 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
11041120
11051121# #### H200 test #####
1106- - label: Distrubted Tests (H200) # optional
1122+ - label: Distributed Tests (H200) # optional
11071123 gpu: h200
11081124 optional: true
11091125 working_dir: "/vllm-workspace/"
11101126 num_gpus: 2
11111127 commands:
11121128 - pytest -v -s tests/compile/test_async_tp.py
11131129 - pytest -v -s tests/compile/test_sequence_parallelism.py
1130+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
1131+ - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
11141132 - pytest -v -s tests/distributed/test_context_parallel.py
11151133 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
11161134
0 commit comments