
Commit 428bc7b

[V0 deprecation] Remove VLLM_USE_V1 usage in most modules (#27955)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 878fd5a commit 428bc7b

19 files changed: +110 additions, -241 deletions


docs/usage/v1_guide.md

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
-To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
-
 ## Why vLLM V1?
 
 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.

tests/conftest.py

Lines changed: 0 additions & 20 deletions
@@ -154,26 +154,6 @@ def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
 """Singleton instance of {class}`AudioTestAssets`."""
 
 
-@pytest.fixture(scope="function", autouse=True)
-def cleanup_VLLM_USE_V1(monkeypatch):
-    """
-    The V1 oracle sets "VLLM_USE_V1" during loading. This means
-    that each invocation of a test change the env variable.
-
-    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
-    made during the test run by vLLM will be cleaned up.
-
-    This fixture is used by every test.
-    """
-
-    # If VLLM_USE_V1 is not set, set then delete. This will
-    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
-    # if VLLM modifies the value of envs.VLLM_USE_V1.
-    if "VLLM_USE_V1" not in os.environ:
-        monkeypatch.setenv("VLLM_USE_V1", "")
-        monkeypatch.delenv("VLLM_USE_V1")
-
-
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
     # pytest_asyncio may use a different event loop per test
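The deleted autouse fixture leaned on pytest's monkeypatch bookkeeping: registering the variable with setenv and immediately delenv-ing it records its original (unset) state, so anything the code under test writes is rolled back on teardown. A minimal, generic sketch of that pattern, using a made-up SOME_FLAG variable rather than VLLM_USE_V1:

```python
import os

import pytest


@pytest.fixture
def restore_some_flag(monkeypatch):
    # Touching the variable through monkeypatch records its original state
    # (here: unset), so whatever the test or library writes is undone on teardown.
    if "SOME_FLAG" not in os.environ:
        monkeypatch.setenv("SOME_FLAG", "")
        monkeypatch.delenv("SOME_FLAG")


def test_flag_is_cleaned_up(restore_some_flag):
    # Simulate library code mutating the environment mid-test.
    os.environ["SOME_FLAG"] = "1"
    assert os.environ["SOME_FLAG"] == "1"
    # On teardown, monkeypatch replays its undo log and SOME_FLAG ends up unset again.
```

With the V0 oracle gone, nothing sets VLLM_USE_V1 during loading anymore, which is presumably why the fixture could be dropped.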

tests/v1/engine/test_async_llm.py

Lines changed: 2 additions & 5 deletions
@@ -424,15 +424,12 @@ async def test_customize_loggers(monkeypatch):
 
 
 @pytest.mark.asyncio
-async def test_customize_aggregated_loggers(monkeypatch):
+async def test_customize_aggregated_loggers():
     """Test that we can customize the aggregated loggers.
     If a customized logger is provided at the init, it should
     be added to the default loggers.
     """
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(
                 TEXT_ENGINE_ARGS,
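Only the VLLM_USE_V1 pin is removed here; the ExitStack stays so the engine is still torn down deterministically. A rough, generic sketch of that cleanup pattern, with a made-up FakeEngine standing in for the AsyncLLM instance (the real test presumably registers the engine's shutdown on the stack in the same way):

```python
import asyncio
from contextlib import ExitStack


class FakeEngine:
    """Stand-in for an engine object exposing an explicit shutdown()."""

    def __init__(self) -> None:
        self.running = True

    def shutdown(self) -> None:
        self.running = False


async def run_once() -> FakeEngine:
    with ExitStack() as after:
        engine = FakeEngine()
        # Register cleanup right after construction so it runs even if the
        # body below raises, mirroring the test's `with ExitStack() as after:`.
        after.callback(engine.shutdown)
        assert engine.running
        return engine


engine = asyncio.run(run_once())
print(engine.running)  # False: shutdown() ran when the ExitStack exited
```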

tests/v1/entrypoints/llm/test_struct_output_generate.py

Lines changed: 0 additions & 3 deletions
@@ -868,11 +868,8 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
 
 @pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
 def test_structured_output_with_structural_tag(
-    monkeypatch: pytest.MonkeyPatch,
     guided_decoding_backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     llm = LLM(
         model="Qwen/Qwen2.5-1.5B-Instruct",
         guided_decoding_backend=guided_decoding_backend,

tests/v1/sample/test_logprobs.py

Lines changed: 59 additions & 62 deletions
@@ -530,7 +530,6 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
 def test_spec_decode_logprobs(
     logprobs_mode: LogprobsMode,
     model_setup: tuple[str, str, str],
-    monkeypatch: pytest.MonkeyPatch,
 ):
     """Spec decode logprobs should match those of the base model.
@@ -541,64 +540,62 @@ def test_spec_decode_logprobs(
     """
     from vllm import LLM
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        prompt = "Hello world"
-        sampling_params = SamplingParams(
-            temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
-        )
-        method, model_name, spec_model_name = model_setup
-        max_model_len = 256
-
-        # Run base LLM.
-        ref_llm = LLM(
-            model=model_name,
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        ref_results = ref_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from reference LLM.
-        ref_logprobs = []
-        for output in ref_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    ref_logprobs.append(logprobs[token_id])
-        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        # Run spec decode LLM.
-        spec_llm = LLM(
-            model_name,
-            speculative_config={
-                "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": max_model_len,
-            },
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        spec_results = spec_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from spec decode LLM.
-        spec_logprobs = []
-        for output in spec_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    spec_logprobs.append(logprobs[token_id])
-        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        # Per-token logprobs are expected to be the same.
-        assert len(ref_logprobs) == len(spec_logprobs)
-        for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
-            assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
-            assert ref_logprob.rank == spec_logprob.rank
-            assert ref_logprob.decoded_token == spec_logprob.decoded_token
+    prompt = "Hello world"
+    sampling_params = SamplingParams(
+        temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
+    )
+    method, model_name, spec_model_name = model_setup
+    max_model_len = 256
+
+    # Run base LLM.
+    ref_llm = LLM(
+        model=model_name,
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    ref_results = ref_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from reference LLM.
+    ref_logprobs = []
+    for output in ref_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                ref_logprobs.append(logprobs[token_id])
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Run spec decode LLM.
+    spec_llm = LLM(
+        model_name,
+        speculative_config={
+            "method": method,
+            "model": spec_model_name,
+            "num_speculative_tokens": 3,
+            "max_model_len": max_model_len,
+        },
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    spec_results = spec_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from spec decode LLM.
+    spec_logprobs = []
+    for output in spec_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                spec_logprobs.append(logprobs[token_id])
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Per-token logprobs are expected to be the same.
+    assert len(ref_logprobs) == len(spec_logprobs)
+    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
+        assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
+        assert ref_logprob.rank == spec_logprob.rank
+        assert ref_logprob.decoded_token == spec_logprob.decoded_token
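The dedented test repeats the same three-level loop to flatten logprobs for both runs. A hypothetical helper in the same spirit (flatten_logprobs is not part of the test, just an illustration of what those loops compute):

```python
def flatten_logprobs(request_output):
    """Flatten the per-token Logprob entries of one RequestOutput, in
    generation order, exactly as the nested loops in the test above do."""
    flat = []
    for output in request_output.outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                flat.append(logprobs[token_id])
    return flat
```

With it, the two collection blocks reduce to `ref_logprobs = flatten_logprobs(ref_results[0])` and `spec_logprobs = flatten_logprobs(spec_results[0])`.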

vllm/attention/layers/chunked_local_attention.py

Lines changed: 6 additions & 12 deletions
@@ -5,7 +5,6 @@
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
@@ -78,17 +77,12 @@ def __init__(
         kv_cache_dtype = "auto"
         block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_chunked_local_attention_backend(
-                underlying_attn_backend, attention_chunk_size, block_size
-            )
-        else:
-            # in v0 the local attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_chunked_local_attention_backend(
+            underlying_attn_backend, attention_chunk_size, block_size
+        )
 
         super().__init__(
             num_heads=num_heads,
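The constructor now always resolves a concrete backend and wraps it, instead of branching on V0 versus V1. A toy, self-contained sketch of that select-then-wrap shape (all names below are stand-ins, not vLLM APIs):

```python
from dataclasses import dataclass


@dataclass
class BaseBackend:
    """Placeholder for whatever the backend selector would return."""

    head_size: int
    dtype: str


@dataclass
class ChunkedLocalBackend:
    """Placeholder wrapper adding chunked-local behaviour on top of a base backend."""

    underlying: BaseBackend
    attention_chunk_size: int
    block_size: int


def select_backend(head_size: int, dtype: str) -> BaseBackend:
    # Stands in for get_attn_backend(head_size, dtype, kv_cache_dtype, block_size).
    return BaseBackend(head_size, dtype)


def build_chunked_local(head_size: int, dtype: str, chunk: int, block: int) -> ChunkedLocalBackend:
    # Always select an underlying backend, then wrap it -- no engine-version branch.
    underlying = select_backend(head_size, dtype)
    return ChunkedLocalBackend(underlying, attention_chunk_size=chunk, block_size=block)


print(build_chunked_local(128, "bfloat16", chunk=8192, block=16))
```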

vllm/attention/layers/cross_attention.py

Lines changed: 4 additions & 10 deletions
@@ -6,7 +6,6 @@
 import numpy as np
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -150,15 +149,10 @@ def __init__(
         kv_cache_dtype = "auto"
         block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_cross_attention_backend(underlying_attn_backend)
-        else:
-            # in v0 cross attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_DECODER, (

vllm/attention/layers/encoder_only_attention.py

Lines changed: 4 additions & 11 deletions
@@ -5,7 +5,6 @@
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -74,17 +73,11 @@ def __init__(
         kv_cache_dtype = "auto"
         block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
 
-            attn_backend = create_encoder_only_attention_backend(
-                underlying_attn_backend
-            )
-        else:
-            # in v0 encoder only attention is handled inside the backends
-            attn_backend = None
+        attn_backend = create_encoder_only_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_ONLY, (

vllm/attention/selector.py

Lines changed: 1 addition & 7 deletions
@@ -134,16 +134,11 @@ def get_attn_backend(
     use_sparse: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,
         kv_cache_dtype=kv_cache_dtype,
         block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
         has_sink=has_sink,
         use_sparse=use_sparse,
@@ -156,7 +151,6 @@ def _cached_get_attn_backend(
     dtype: torch.dtype,
     kv_cache_dtype: str | None,
     block_size: int,
-    use_v1: bool = False,
     use_mla: bool = False,
     has_sink: bool = False,
     use_sparse: bool = False,
@@ -199,7 +193,7 @@
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
+        True,
         use_mla,
         has_sink,
         use_sparse,
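The deleted comment described a genuine pitfall: an @lru_cache-decorated function that reads an environment-dependent value internally will keep returning its first answer, because the env var is not part of the cache key. With the V1 flag hard-wired to True this no longer matters, but the failure mode is easy to reproduce standalone (toy names, unrelated to vLLM):

```python
import os
from functools import lru_cache


@lru_cache
def pick_backend(head_size: int) -> str:
    # The env var is read inside the cached function, so it is not part of
    # the cache key; the first call's result sticks for this head_size.
    return "flash" if os.environ.get("TOY_USE_FLASH") == "1" else "naive"


os.environ["TOY_USE_FLASH"] = "1"
print(pick_backend(128))  # "flash"

os.environ["TOY_USE_FLASH"] = "0"
print(pick_backend(128))  # still "flash": the stale cached result is returned

# The old workaround was to read the value in the un-cached wrapper and pass it
# in as an argument, which is what get_attn_backend did with use_v1 before this
# commit removed the parameter.
```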

vllm/distributed/kv_transfer/kv_connector/factory.py

Lines changed: 0 additions & 7 deletions
@@ -5,7 +5,6 @@
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Optional, cast
 
-import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.base import (
     KVConnectorBase,
     KVConnectorBaseType,
@@ -47,12 +46,6 @@ def create_connector(
         role: KVConnectorRole,
         kv_cache_config: Optional["KVCacheConfig"] = None,
     ) -> KVConnectorBase:
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Attempting to initialize a V1 Connector, "
-                f"but found {envs.VLLM_USE_V1=}"
-            )
-
         kv_transfer_config = config.kv_transfer_config
         if kv_transfer_config is None:
             raise ValueError("kv_transfer_config must be set to create a connector")
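With the guard removed, create_connector only validates the transfer config and dispatches to whichever connector is registered for it. A toy registry sketch of that shape (the dict-based config and names below are illustrative, not the vLLM classes):

```python
from collections.abc import Callable

# Minimal name-to-constructor registry in the spirit of a connector factory.
_REGISTRY: dict[str, Callable[[dict], object]] = {}


def register_connector(name: str, ctor: Callable[[dict], object]) -> None:
    _REGISTRY[name] = ctor


def create_connector(config: dict) -> object:
    kv_transfer_config = config.get("kv_transfer_config")
    if kv_transfer_config is None:
        raise ValueError("kv_transfer_config must be set to create a connector")
    # Dispatch purely on the configured connector name; no engine-version check.
    return _REGISTRY[kv_transfer_config["kv_connector"]](kv_transfer_config)


register_connector("toy_connector", lambda cfg: ("ToyConnector", cfg))
print(create_connector({"kv_transfer_config": {"kv_connector": "toy_connector"}}))
```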
