Commit cb04d93

MatthewBonanni authored and xuebwang-amd committed
[V0 deprecation] Remove _VLLM_V1 suffixes from attention backend names (vllm-project#25489)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent 7da7940 commit cb04d93

42 files changed, +131 -174 lines
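For users, the visible effect of this commit is that VLLM_ATTENTION_BACKEND now takes the shortened names (for example TRITON_ATTN instead of TRITON_ATTN_VLLM_V1). A minimal sketch of pinning the backend before engine construction, mirroring what the updated tests and CI scripts do (the model and prompt are placeholders, not part of this diff):

import os

# The override must be in place before the engine is created.
# "TRITON_ATTN" replaces the old "TRITON_ATTN_VLLM_V1" name.
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"

from vllm import LLM

llm = LLM(model="facebook/opt-125m", enforce_eager=True)
print(llm.generate("Hello, my name is")[0].outputs[0].text)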

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ docker run \
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-  VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   cd tests
   pytest -v -s v1/core
   pytest -v -s v1/engine

tests/compile/piecewise/test_full_cudagraph.py

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ class BackendConfig:
     # Triton Attention
     "TritonAttn":
     BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
+                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
                   comp_config={
                       "cudagraph_mode": "FULL",
                   }),

tests/compile/test_fusion_attn.py

Lines changed: 1 addition & 1 deletion
@@ -338,7 +338,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 @pytest.mark.parametrize("model_name, model_class", MODELS)
 @pytest.mark.parametrize("backend",
                          [_Backend.FLASHINFER] if current_platform.is_cuda()
-                         else [_Backend.TRITON_ATTN_VLLM_V1])
+                         else [_Backend.TRITON_ATTN])
 @pytest.mark.parametrize(
     "split_attention",
     [False, True] if current_platform.is_rocm() else [False])

tests/entrypoints/openai/test_serving_chat.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def default_server_args(with_tool_parser: bool):
 def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
                   default_server_args: list[str]):
     with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
         with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
                                 default_server_args) as remote_server:
             yield remote_server

tests/kernels/attention/test_attention_selector.py

Lines changed: 18 additions & 16 deletions
@@ -31,7 +31,7 @@ def clear_cache():
 }

 DEVICE_REGULAR_ATTN_BACKENDS = {
-    "cuda": ["XFORMERS", "FLASHINFER"],
+    "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
     "hip": ["ROCM_FLASH"],
     "cpu": ["TORCH_SDPA"],
 }
@@ -86,7 +86,7 @@ def test_env(
         with patch("vllm.attention.selector.current_platform",
                    CpuPlatform()):
             backend = get_attn_backend(16, torch.float16, None, block_size)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            assert backend.get_name() == "TORCH_SDPA"

     elif device == "hip":
         with patch("vllm.attention.selector.current_platform",
@@ -125,15 +125,15 @@ def test_env(
                                            None,
                                            block_size,
                                            use_mla=use_mla)
-                expected = f"{name}_VLLM_V1"
+                expected = name
                 assert backend.get_name() == expected
             else:
                 backend = get_attn_backend(16,
                                            torch.float16,
                                            None,
                                            block_size,
                                            use_mla=use_mla)
-                expected = "TRITON_ATTN_VLLM_V1"
+                expected = "TRITON_ATTN"
                 assert backend.get_name() == expected

     elif device == "cuda":
@@ -160,7 +160,7 @@ def test_env(
                                            None,
                                            block_size,
                                            use_mla=use_mla)
-                expected = "CUTLASS_MLA_VLLM_V1"
+                expected = "CUTLASS_MLA"
                 assert backend.get_name() == expected
             elif name == "FLASHINFER_MLA":
                 if block_size not in [32, 64]:
@@ -193,7 +193,7 @@ def test_env(
                                            None,
                                            block_size,
                                            use_mla=use_mla)
-                expected = f"{name}_VLLM_V1"
+                expected = name
                 assert backend.get_name() == expected
             elif name == "FLASH_ATTN_MLA":
                 backend = get_attn_backend(16,
@@ -210,33 +210,32 @@ def test_env(
                                            None,
                                            block_size,
                                            use_mla=use_mla)
-                expected = "TRITON_MLA_VLLM_V1"
+                expected = "TRITON_MLA"
                 assert backend.get_name() == expected
         elif name == "FLASHINFER":
             backend = get_attn_backend(16,
                                        torch.float16,
                                        None,
                                        block_size,
                                        use_mla=use_mla)
-            expected = "FLASHINFER_VLLM_V1"
+            expected = "FLASHINFER"
             assert backend.get_name() == expected
-        else:
+        elif name == "XFORMERS":
             backend = get_attn_backend(32,
                                        torch.float16,
                                        None,
                                        block_size,
                                        use_mla=use_mla)
-            expected = "FLASH_ATTN_VLLM_V1"
+            expected = "XFORMERS"
             assert backend.get_name() == expected
-
-            backend = get_attn_backend(16,
+        elif name == "FLASH_ATTN":
+            backend = get_attn_backend(32,
                                        torch.float16,
                                        None,
                                        block_size,
                                        use_mla=use_mla)
-            assert backend.get_name() == "FLEX_ATTENTION", (
-                "Should fallback to FlexAttention if head size is "
-                "not supported by FlashAttention")
+            expected = "FLASH_ATTN"
+            assert backend.get_name() == expected


 @pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -252,7 +251,7 @@ def test_fp32_fallback(
         with patch("vllm.attention.selector.current_platform",
                    CpuPlatform()):
             backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            assert backend.get_name() == "TORCH_SDPA"

     elif device == "cuda":
         with patch("vllm.attention.selector.current_platform",
@@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
     # get_attn_backend

+    pytest.skip("Skipping as current backend selector does not " \
+                "handle fallbacks when a backend is set via env var.")
+
     with monkeypatch.context() as m:
         m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
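As a side note on the selector behaviour these tests pin down, here is a minimal sketch of querying the selector directly; the import paths are assumed from the patch targets above and are not part of this diff:

import torch
from unittest.mock import patch

from vllm.attention.selector import get_attn_backend  # assumed import path
from vllm.platforms.cpu import CpuPlatform  # assumed import path

# On CPU the selector now reports the plain backend name.
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
    backend = get_attn_backend(16, torch.float16, None, 16)
    assert backend.get_name() == "TORCH_SDPA"  # previously "TORCH_SDPA_VLLM_V1"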

tests/kernels/attention/test_rocm_attention_selector.py

Lines changed: 5 additions & 9 deletions
@@ -28,7 +28,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
         # Test standard ROCm attention
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert (backend.get_name() == "ROCM_FLASH"
-                or backend.get_name() == "TRITON_ATTN_VLLM_V1")
+                or backend.get_name() == "TRITON_ATTN")

         # MLA test for deepseek related

@@ -40,8 +40,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    16,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "TRITON_MLA"
-                or backend.get_name() == "TRITON_MLA_VLLM_V1")
+        assert backend.get_name() == "TRITON_MLA"

         # If attention backend is None
         # If use_mla is true
@@ -53,8 +52,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    16,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "TRITON_MLA"
-                or backend.get_name() == "TRITON_MLA_VLLM_V1")
+        assert backend.get_name() == "TRITON_MLA"

         # change the attention backend to AITER MLA
         m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
@@ -64,8 +62,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    1,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "ROCM_AITER_MLA"
-                or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
+        assert backend.get_name() == "ROCM_AITER_MLA"

         # If attention backend is None
         # If use_mla is true
@@ -79,5 +76,4 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
                                    1,
                                    False,
                                    use_mla=True)
-        assert (backend.get_name() == "ROCM_AITER_MLA"
-                or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
+        assert backend.get_name() == "ROCM_AITER_MLA"

tests/kernels/utils.py

Lines changed: 4 additions & 4 deletions
@@ -524,22 +524,22 @@ def make_backend(backend_name: str) -> AttentionBackend:

     * Backend instance
     '''
-    if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
+    if backend_name == STR_XFORMERS_ATTN_VAL:
         from vllm.v1.attention.backends.xformers import (
             XFormersAttentionBackend)
         return XFormersAttentionBackend()
-    if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
+    if backend_name == STR_FLASH_ATTN_VAL:
         from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
         return FlashAttentionBackend()
-    if backend_name == "TRITON_ATTN_VLLM_V1":
+    if backend_name == "TRITON_ATTN":
         from vllm.v1.attention.backends.triton_attn import (
             TritonAttentionBackend)
         return TritonAttentionBackend()
     if backend_name == "FLEX_ATTENTION":
         from vllm.v1.attention.backends.flex_attention import (
             FlexAttentionBackend)
         return FlexAttentionBackend()
-    if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
+    if backend_name == "TORCH_SDPA":
         from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
         return TorchSDPABackend()
     if backend_name == "FLASHINFER":
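A hedged usage sketch for the updated dispatcher, run from within the vLLM test tree (the import path assumes the test package is importable in that environment; the print is illustrative only):

from tests.kernels.utils import make_backend  # assumed importable in the test environment

# The helper now recognises only the plain names; the *_VLLM_V1 aliases are gone.
backend = make_backend("TRITON_ATTN")
print(type(backend).__name__)  # TritonAttentionBackend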

tests/models/test_initialization.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def _initialize_kv_caches_v1(self, vllm_config):
         # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
         # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
         # L4 supports FA3.
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
         if model_arch == "WhisperForConditionalGeneration":
             m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
         LLM(

tests/utils.py

Lines changed: 4 additions & 4 deletions
@@ -1131,14 +1131,14 @@ def has_module_attribute(module_name, attribute_name):

 def get_attn_backend_list_based_on_platform() -> list[str]:
     if current_platform.is_cuda():
-        return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"]
+        return ["FLASH_ATTN", "TRITON_ATTN", "TREE_ATTN"]
     elif current_platform.is_rocm():
-        attn_backend_list = ["TRITON_ATTN_VLLM_V1"]
+        attn_backend_list = ["TRITON_ATTN"]
         try:
             import aiter  # noqa: F401
-            attn_backend_list.append("FLASH_ATTN_VLLM_V1")
+            attn_backend_list.append("FLASH_ATTN")
         except Exception:
-            print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed")
+            print("Skip FLASH_ATTN on ROCm as aiter is not installed")

         return attn_backend_list
     else:
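For context, a sketch of how a helper like this is typically consumed when parametrizing tests over the renamed backends (the test name and body here are hypothetical, not part of this commit):

import pytest

from tests.utils import get_attn_backend_list_based_on_platform  # assumed importable in the test environment


@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
def test_with_each_backend(attn_backend: str, monkeypatch: pytest.MonkeyPatch):
    # Hypothetical body: pin the backend via the same env var the suite uses.
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)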

tests/v1/attention/test_attention_backends.py

Lines changed: 7 additions & 8 deletions
@@ -21,16 +21,15 @@
 from vllm.v1.kv_cache_interface import FullAttentionSpec

 BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
-    _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN,
-    "FLEX_ATTENTION_SLOW"
+    _Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
+    _Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
 ]

 # Remove flashinfer from the list if it's not available
 try:
     import flashinfer  # noqa: F401
 except ImportError:
-    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
+    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)


 def _convert_dtype_to_torch(dtype):
@@ -214,7 +213,7 @@ def run_attention_backend(
     builder_cls, impl_cls = get_attention_backend(actual_backend)

     # Mock flashinfer's get_per_layer_parameters if needed
-    if actual_backend == _Backend.FLASHINFER_VLLM_V1:
+    if actual_backend == _Backend.FLASHINFER:
         import unittest.mock

         from vllm.v1.attention.backends.utils import PerLayerParameters
@@ -434,7 +433,7 @@ def _test_backend_correctness(
     # [num_blocks, 2, block_size, num_kv_heads, head_size]
     # Select the appropriate KV cache format for each backend
     kv_cache_for_backend = kv_cache
-    if backend_name == _Backend.FLASHINFER_VLLM_V1:
+    if backend_name == _Backend.FLASHINFER:
         kv_cache_for_backend = kv_cache.transpose(0, 1)

     # For FlashInfer default to HND layout and
@@ -518,8 +517,8 @@ causal_mask_mod(


 SLIDING_WINDOW_BACKENDS_TO_TEST = [
-    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
-    _Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
+    _Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
+    "FLEX_ATTENTION_SLOW"
 ]