33
44import json
55import os
6+ from typing import Any
67
78import pytest
89
@@ -24,12 +25,21 @@ def set_test_environment():
2425 os .environ ["FLASHINFER_NVCC_THREADS" ] = "16"
2526
2627
# Override the backbone to 4 layers so test servers start quickly.
# Text-only models read the layer counts from the top-level config;
# multimodal models nest them under "text_config".
HF_OVERRIDE_TEXT: dict[str, Any] = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}
HF_OVERRIDE_MM: dict[str, Any] = {
    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
}
3036
3137
32- def can_initialize (model : str , extra_args : list [str ] | None = None ):
38+ def can_initialize (
39+ model : str ,
40+ hf_overrides : dict [str , Any ] | None = None ,
41+ extra_args : list [str ] | None = None ,
42+ ):
3343 # Server arguments
3444 extra_args = extra_args if extra_args is not None else []
3545 server_args = [
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
5060 model ,
5161 server_args ,
5262 max_wait_seconds = 1500 , # Due to FlashInfer compile
53- override_hf_configs = dummy_hf_overrides ,
63+ override_hf_configs = hf_overrides ,
5464 ) as server :
5565 client = server .get_client ()
5666 # Make a simple request to verify the server works
@@ -77,36 +87,41 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 with the FlashInfer CUTLASS (throughput) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )
8193
8294
83- @pytest .mark .skip (reason = "Works, but takes too long to run" )
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 with the FlashInfer TRT-LLM (latency) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )
88101
89102
90- @pytest .mark .skip (reason = "Works, but takes too long to run" )
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with the FlashInfer CUTLASS (throughput) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )
95109
96110
97- @pytest .mark .skip (reason = "RuntimeError: No kernel found for the given options" )
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with the FlashInfer TRT-LLM (latency) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )
102117
103118
## DeepSeekV3 ##
105120
106121
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block-quantized MoE with DeepGEMM enabled."""
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
110125
111126
112127@pytest .mark .skip (
@@ -118,41 +133,40 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
122137
123138
def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with the FlashInfer TRT-LLM (latency) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
128143
129144
def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE with the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
134149
135150
136- @pytest .mark .skip (reason = "RuntimeError: No kernel found for the given options" )
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE with the FlashInfer TRT-LLM (latency) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
141155
142156
## GPT-OSS ##
144158
145159
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights with BF16 activations via FlashInfer."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
149163
150164
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights / MXFP8 activations via FlashInfer CUTLASS."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
154168
155169
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights / MXFP8 activations via FlashInfer TRT-LLM."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
0 commit comments