33
44import json
55import os
6+ from typing import Any
67
78import pytest
89
@@ -24,12 +25,21 @@ def set_test_environment():
2425 os .environ ["FLASHINFER_NVCC_THREADS" ] = "16"
2526
2627
# Override the backbone to 4 layers so test servers start quickly.
# Text-only models read the layer counts from the top-level config;
# multimodal models nest them under "text_config".
HF_OVERRIDE_TEXT: dict[str, Any] = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}
HF_OVERRIDE_MM: dict[str, Any] = {
    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
}
3036
3137
32- def can_initialize (model : str , extra_args : list [str ] | None = None ):
38+ def can_initialize (
39+ model : str ,
40+ hf_overrides : dict [str , Any ] | None = None ,
41+ extra_args : list [str ] | None = None ,
42+ ):
3343 # Server arguments
3444 extra_args = extra_args if extra_args is not None else []
3545 server_args = [
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
5060 model ,
5161 server_args ,
5262 max_wait_seconds = 1500 , # Due to FlashInfer compile
53- override_hf_configs = dummy_hf_overrides ,
63+ override_hf_configs = hf_overrides ,
5464 ) as server :
5565 client = server .get_client ()
5666 # Make a simple request to verify the server works
@@ -77,36 +87,41 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 with the FlashInfer CUTLASS (throughput) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )
8193
8294
83- @pytest .mark .skip (reason = "Works, but takes too long to run" )
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout FP8 with the FlashInfer TRT-LLM (latency) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )
88101
89102
90- @pytest .mark .skip (reason = "Works, but takes too long to run" )
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with the FlashInfer CUTLASS (throughput) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )
95109
96110
97- @pytest .mark .skip (reason = "RuntimeError: No kernel found for the given options" )
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama-4 Scout NVFP4 with the FlashInfer TRT-LLM (latency) MoE backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )
102117
103118
## DeepSeekV3 ##
105120
106121
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block-quantized MoE with DeepGEMM enabled."""
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
110125
111126
112127@pytest .mark .skip (
@@ -118,41 +133,40 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
122137
123138
def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-V3.1 FP8 block MoE with the FlashInfer TRT-LLM (latency) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
128143
129144
def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE with the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
134149
135150
136- @pytest .mark .skip (reason = "RuntimeError: No kernel found for the given options" )
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek-R1 NVFP4 MoE with the FlashInfer TRT-LLM (latency) backend."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
141155
142156
## GPT-OSS ##
144158
145159
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights with BF16 activations via FlashInfer."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
149163
150164
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights / MXFP8 activations via FlashInfer CUTLASS."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
154168
155169
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """gpt-oss-20b MXFP4 weights / MXFP8 activations via FlashInfer TRT-LLM."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
0 commit comments