Commit 8078189

mgoin and albertoperdomo2 authored and committed

[CI] Enable Blackwell Llama4 MoE tests (vllm-project#26731)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Alberto Perdomo <aperdomo@redhat.com>

1 parent 8d44454 commit 8078189

File tree

2 files changed: 36 additions & 22 deletions

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -529,7 +529,7 @@ steps:
     # we can only upgrade after this is resolved
     # TODO(jerryzh168): resolve the above comment
     - uv pip install --system torchao==0.13.0
-    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
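
The pipeline change drops the Blackwell MoE suite from the general quantization step, presumably because these tests now run in their own Blackwell-specific step. As a minimal sketch (not part of the commit), the suite can be invoked on its own through pytest's Python API; this assumes Blackwell hardware, the FlashInfer dependencies, and the repository's tests/ directory as the working directory:

    import pytest

    # Run only the re-enabled Blackwell MoE suite; the general quantization
    # step now passes --ignore for this file, so it is exercised separately.
    pytest.main(["-v", "-s", "quantization/test_blackwell_moe.py"])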

tests/quantization/test_blackwell_moe.py

Lines changed: 35 additions & 21 deletions
@@ -3,6 +3,7 @@
 
 import json
 import os
+from typing import Any
 
 import pytest
 
@@ -24,12 +25,21 @@ def set_test_environment():
     os.environ["FLASHINFER_NVCC_THREADS"] = "16"
 
 
-# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
-#     "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
-dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
+# Override the backbone layers to 4 for faster startup
+HF_OVERRIDE_TEXT = {
+    "num_layers": 4,
+    "num_hidden_layers": 4,
+}
+HF_OVERRIDE_MM = {
+    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
+}
 
 
-def can_initialize(model: str, extra_args: list[str] | None = None):
+def can_initialize(
+    model: str,
+    hf_overrides: dict[str, Any] | None = None,
+    extra_args: list[str] | None = None,
+):
     # Server arguments
     extra_args = extra_args if extra_args is not None else []
     server_args = [
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
         model,
         server_args,
         max_wait_seconds=1500,  # Due to FlashInfer compile
-        override_hf_configs=dummy_hf_overrides,
+        override_hf_configs=hf_overrides,
     ) as server:
         client = server.get_client()
         # Make a simple request to verify the server works
@@ -77,36 +87,41 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
 def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
 ## DeepSeekV3 ##
 
 
 def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 @pytest.mark.skip(
@@ -118,41 +133,40 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
 def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 ## GPT-OSS ##
 
 
 def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
