Commit 109a01a

Merge branch 'main' into elvischenv/update-flashinfer
2 parents: 44cc643 + eb577e4

74 files changed: +2598 / -1709 lines changed

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 2 deletions
@@ -296,6 +296,7 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
@@ -317,7 +318,7 @@ steps:
   no_gpu: true
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
+  - pytest -v -s -m 'cpu_test' v1/core
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@@ -867,7 +868,7 @@ steps:
   - pytest -s -v tests/quantization/test_blackwell_moe.py

- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 75
+  timeout_in_minutes: 120
   gpu: b200
   optional: true # run on nightlies
   source_file_dependencies:
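
The split above assumes the tests under `v1/core` carry a `cpu_test` pytest marker, so the GPU job deselects them with `-m 'not cpu_test'` while the CPU-only job selects them with `-m 'cpu_test'`. A minimal sketch of how such a marker is declared and applied; the registration hook and the test names below are illustrative, not taken from this commit:

```python
# conftest.py (illustrative) -- register the marker so -m selection works
# without "unknown marker" warnings.
def pytest_configure(config):
    config.addinivalue_line("markers", "cpu_test: test that runs without a GPU")


# v1/core/test_example.py (illustrative)
import pytest


@pytest.mark.cpu_test
def test_prefix_cache_bookkeeping_cpu_only():
    # Selected by: pytest -v -s -m 'cpu_test' v1/core
    assert [1, 2, 3][-1] == 3


def test_block_allocation_on_gpu():
    # Unmarked, so selected by: pytest -v -s -m 'not cpu_test' v1/core
    assert True
```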

docs/models/supported_models.md

Lines changed: 2 additions & 0 deletions
@@ -390,6 +390,7 @@ th {
 | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
 | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
@@ -576,6 +577,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
 | `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
+| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ |

 !!! note
     Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
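
The new `Lfm2MoeForCausalLM` row slots into the same generative workflow used elsewhere in this commit; below is a minimal sketch using the example checkpoint from the table, with an illustrative prompt and sampling settings. For the `ModernBertForTokenClassification` row, the NER examples referenced in the note above are the intended entry point.

```python
from vllm import LLM, SamplingParams

# Load the newly listed LFM2 MoE checkpoint (architecture: Lfm2MoeForCausalLM).
llm = LLM(model="LiquidAI/LFM2-8B-A1B-preview")

# Greedy-decode a short completion; prompt and token budget are illustrative.
outputs = llm.generate(
    ["Summarize what a mixture-of-experts layer does."],
    SamplingParams(temperature=0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```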

tests/basic_correctness/test_basic_correctness.py

Lines changed: 2 additions & 7 deletions
@@ -13,7 +13,7 @@
 import torch
 
 from vllm import LLM
-from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
+from vllm.v1.engine.llm_engine import LLMEngine
 
 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
@@ -211,16 +211,11 @@ def test_models_distributed(
 
 
 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-    from vllm.envs import VLLM_USE_V1
-
-    if not VLLM_USE_V1:
-        pytest.skip("Skipping V0 test, dump input not supported")
-
     # Needed to mock an error in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngine):
             v1_test_failed_model_execution(vllm_model)
 

tests/basic_correctness/test_cumem.py

Lines changed: 48 additions & 57 deletions
@@ -117,68 +117,59 @@ def model(x):
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
     [
         # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
     ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
-
-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
-
-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
-
-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be releasesd from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
-
-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
-
-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
-
-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
-
-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
-
-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be releasesd from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text
 
 
 @create_new_process_for_each_test()
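
Outside the test harness, the flow exercised above is the public sleep-mode API: construct the engine with `enable_sleep_mode=True`, release GPU memory with `sleep(level=1)`, and restore it either all at once or staged by tag. A minimal sketch of that same sequence, with an illustrative model and prompt:

```python
from vllm import LLM, SamplingParams

llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
params = SamplingParams(temperature=0, max_tokens=10)

first = llm.generate("How are you?", params)

# Free GPU memory while the engine is idle (level 1, as in the test above).
llm.sleep(level=1)

# Either restore everything in one call...
llm.wake_up()

# ...or stage the wake-up: weights first, then KV-cache memory.
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
llm.wake_up(tags=["kv_cache"])

# Greedy decoding should reproduce the pre-sleep output.
second = llm.generate("How are you?", params)
assert first[0].outputs[0].text == second[0].outputs[0].text
```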

tests/compile/piecewise/test_full_cudagraph.py

Lines changed: 0 additions & 2 deletions
@@ -66,7 +66,6 @@ def llm_pair(request):
         pytest.skip("Only Blackwell GPUs support Cutlass MLA")
 
     env_vars = {
-        "VLLM_USE_V1": "1",
         # Force native sampler to avoid potential nondeterminism in FlashInfer
         # when per-request generators are not used in V1.
         "VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
     with (
         temporary_environ(
             {
-                "VLLM_USE_V1": "1",
                 "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
                 # Flex_Attention is not supported with full cuda graph
             }

tests/compile/piecewise/test_simple.py

Lines changed: 0 additions & 3 deletions
@@ -18,7 +18,6 @@
     VllmConfig,
     set_current_vllm_config,
 )
-from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer
 
@@ -127,7 +126,6 @@ def _run_simple_model(
 @pytest.mark.parametrize("use_inductor", [True, False])
 @torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
     _run_simple_model(
         splitting_ops=["silly.attention"],
         use_inductor_graph_partition=False,
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
 @torch.inference_mode()
 @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
     if not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

tests/compile/test_async_tp.py

Lines changed: 1 addition & 7 deletions
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
         "pass_config": {"enable_async_tp": async_tp_enabled},
     }
 
-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
-    }
-
     async_tp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
         "mp",
     ]
 
-    compare_two_settings(
-        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
-    )
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")

tests/compile/test_config.py

Lines changed: 1 addition & 11 deletions
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
@@ -16,15 +15,10 @@ def test_version():
     assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
 
 
-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
+def test_use_cudagraphs_dynamic():
     vllm_config = VllmConfig()
     assert vllm_config.compilation_config.use_cudagraph
 
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
-
 
 def test_custom_op():
     # proper syntax
@@ -41,8 +35,6 @@ def test_custom_op():
 # may be influenced by other tests.
 @pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
 @pytest.mark.forked
 @pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

tests/compile/test_fusion_attn.py

Lines changed: 0 additions & 3 deletions
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
     model_class: type[AttentionQuantPatternModel],
     backend: _Backend,
     use_inductor_graph_partition: bool,
-    monkeypatch,
     dist_init,
     caplog_vllm,
 ):
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
     if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
 
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     device = torch.device("cuda:0")
     torch.manual_seed(42)

tests/config/test_mp_reducer.py

Lines changed: 1 addition & 4 deletions
@@ -8,16 +8,13 @@
 from vllm.v1.engine.async_llm import AsyncLLM
 
 
-def test_mp_reducer(monkeypatch):
+def test_mp_reducer():
     """
     Test that _reduce_config reducer is registered when AsyncLLM is instantiated
    without transformers_modules. This is a regression test for
     https://github.com/vllm-project/vllm/pull/18640.
     """
 
-    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     # Ensure transformers_modules is not in sys.modules
     if "transformers_modules" in sys.modules:
         del sys.modules["transformers_modules"]
