2 changes: 1 addition & 1 deletion .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,7 +35,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
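For context, a minimal Python sketch of the same smoke test (not part of this diff; it assumes a working vLLM install and access to facebook/opt-125m): the renamed TRITON_ATTN value is exported before the engine is created, exactly as the shell wrapper above does.

import os

# The backend must be chosen before vLLM initializes the engine.
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"  # formerly TRITON_ATTN_VLLM_V1

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", block_size=64, enforce_eager=True)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)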
2 changes: 1 addition & 1 deletion tests/compile/piecewise/test_full_cudagraph.py
@@ -103,7 +103,7 @@ class BackendConfig:
# Triton Attention
"TritonAttn":
BackendConfig(name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL",
}),
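Rough hand-driven equivalent of the "TritonAttn" entry above (an assumed usage sketch, not code from this PR): the env var carries the new backend name and the CUDA graph mode goes through compilation_config.

import os

os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"

from vllm import LLM

# "cudagraph_mode": "FULL" mirrors the comp_config used by the test entry above.
llm = LLM(model="facebook/opt-125m",
          compilation_config={"cudagraph_mode": "FULL"})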
2 changes: 1 addition & 1 deletion tests/compile/test_fusion_attn.py
@@ -338,7 +338,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend",
[_Backend.FLASHINFER] if current_platform.is_cuda()
else [_Backend.TRITON_ATTN_VLLM_V1])
else [_Backend.TRITON_ATTN])
@pytest.mark.parametrize(
"split_attention",
[False, True] if current_platform.is_rocm() else [False])
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -68,7 +68,7 @@ def default_server_args(with_tool_parser: bool):
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
default_server_args: list[str]):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
default_server_args) as remote_server:
yield remote_server
34 changes: 18 additions & 16 deletions tests/kernels/attention/test_attention_selector.py
@@ -31,7 +31,7 @@ def clear_cache():
}

DEVICE_REGULAR_ATTN_BACKENDS = {
"cuda": ["XFORMERS", "FLASHINFER"],
"cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_FLASH"],
"cpu": ["TORCH_SDPA"],
}
@@ -86,7 +86,7 @@ def test_env(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"

elif device == "hip":
with patch("vllm.attention.selector.current_platform",
@@ -125,15 +125,15 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1"
expected = "TRITON_ATTN"
assert backend.get_name() == expected

elif device == "cuda":
@@ -160,7 +160,7 @@
None,
block_size,
use_mla=use_mla)
expected = "CUTLASS_MLA_VLLM_V1"
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if block_size not in [32, 64]:
@@ -193,7 +193,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
backend = get_attn_backend(16,
@@ -210,33 +210,32 @@
None,
block_size,
use_mla=use_mla)
expected = "TRITON_MLA_VLLM_V1"
expected = "TRITON_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1"
expected = "FLASHINFER"
assert backend.get_name() == expected
else:
elif name == "XFORMERS":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1"
expected = "XFORMERS"
assert backend.get_name() == expected

backend = get_attn_backend(16,
elif name == "FLASH_ATTN":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
expected = "FLASH_ATTN"
assert backend.get_name() == expected


@pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -252,7 +251,7 @@ def test_fp32_fallback(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"

elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
@@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend

pytest.skip("Skipping as current backend selector does not " \
"handle fallbacks when a backend is set via env var.")

with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)

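The assertion changes above all follow one pattern; the self-contained sketch below (illustration only, not code from this PR) spells out the old-to-new mapping the updated tests now expect.

# Old env-var / get_name() values on the left, the names asserted above on the right.
RENAMED_BACKENDS = {
    "FLASH_ATTN_VLLM_V1": "FLASH_ATTN",
    "TRITON_ATTN_VLLM_V1": "TRITON_ATTN",
    "XFORMERS_VLLM_V1": "XFORMERS",
    "FLASHINFER_VLLM_V1": "FLASHINFER",
    "TORCH_SDPA_VLLM_V1": "TORCH_SDPA",
    "CUTLASS_MLA_VLLM_V1": "CUTLASS_MLA",
    "FLASHMLA_VLLM_V1": "FLASHMLA",
    "TRITON_MLA_VLLM_V1": "TRITON_MLA",
    "ROCM_AITER_MLA_VLLM_V1": "ROCM_AITER_MLA",
}

def strip_v1_suffix(name: str) -> str:
    """Translate an old backend name; names without the suffix pass through."""
    return name.removesuffix("_VLLM_V1")

assert all(strip_v1_suffix(old) == new for old, new in RENAMED_BACKENDS.items())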
14 changes: 5 additions & 9 deletions tests/kernels/attention/test_rocm_attention_selector.py
@@ -28,7 +28,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
or backend.get_name() == "TRITON_ATTN")

# MLA test for deepseek related

@@ -40,8 +40,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
16,
False,
use_mla=True)
assert (backend.get_name() == "TRITON_MLA"
or backend.get_name() == "TRITON_MLA_VLLM_V1")
assert backend.get_name() == "TRITON_MLA"

# If attention backend is None
# If use_mla is true
@@ -53,8 +52,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
16,
False,
use_mla=True)
assert (backend.get_name() == "TRITON_MLA"
or backend.get_name() == "TRITON_MLA_VLLM_V1")
assert backend.get_name() == "TRITON_MLA"

# change the attention backend to AITER MLA
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
@@ -64,8 +62,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
1,
False,
use_mla=True)
assert (backend.get_name() == "ROCM_AITER_MLA"
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
assert backend.get_name() == "ROCM_AITER_MLA"

# If attention backend is None
# If use_mla is true
@@ -79,5 +76,4 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
1,
False,
use_mla=True)
assert (backend.get_name() == "ROCM_AITER_MLA"
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
assert backend.get_name() == "ROCM_AITER_MLA"
8 changes: 4 additions & 4 deletions tests/kernels/utils.py
@@ -524,22 +524,22 @@ def make_backend(backend_name: str) -> AttentionBackend:

* Backend instance
'''
if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
if backend_name == STR_XFORMERS_ATTN_VAL:
from vllm.v1.attention.backends.xformers import (
XFormersAttentionBackend)
return XFormersAttentionBackend()
if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
if backend_name == STR_FLASH_ATTN_VAL:
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
return FlashAttentionBackend()
if backend_name == "TRITON_ATTN_VLLM_V1":
if backend_name == "TRITON_ATTN":
from vllm.v1.attention.backends.triton_attn import (
TritonAttentionBackend)
return TritonAttentionBackend()
if backend_name == "FLEX_ATTENTION":
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionBackend)
return FlexAttentionBackend()
if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
if backend_name == "TORCH_SDPA":
from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
return TorchSDPABackend()
if backend_name == "FLASHINFER":
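A hedged usage sketch for the tightened helper above (assumes the repo's tests package is on the import path; not part of the diff): callers now pass the bare backend name and get the matching V1 backend instance back.

from tests.kernels.utils import make_backend

backend = make_backend("TRITON_ATTN")       # "TRITON_ATTN_VLLM_V1" is no longer accepted
assert backend.get_name() == "TRITON_ATTN"  # the V1 class now reports the bare name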
2 changes: 1 addition & 1 deletion tests/models/test_initialization.py
@@ -84,7 +84,7 @@ def _initialize_kv_caches_v1(self, vllm_config):
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM(
8 changes: 4 additions & 4 deletions tests/utils.py
@@ -1131,14 +1131,14 @@ def has_module_attribute(module_name, attribute_name):

def get_attn_backend_list_based_on_platform() -> list[str]:
if current_platform.is_cuda():
return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"]
return ["FLASH_ATTN", "TRITON_ATTN", "TREE_ATTN"]
elif current_platform.is_rocm():
attn_backend_list = ["TRITON_ATTN_VLLM_V1"]
attn_backend_list = ["TRITON_ATTN"]
try:
import aiter # noqa: F401
attn_backend_list.append("FLASH_ATTN_VLLM_V1")
attn_backend_list.append("FLASH_ATTN")
except Exception:
print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed")
print("Skip FLASH_ATTN on ROCm as aiter is not installed")

return attn_backend_list
else:
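A sketch of the helper's typical consumer (assumed usage, not taken from this diff): the returned names are fed straight into pytest parametrization and into VLLM_ATTENTION_BACKEND.

import pytest

from tests.utils import get_attn_backend_list_based_on_platform


@pytest.mark.parametrize("attn_backend",
                         get_attn_backend_list_based_on_platform())
def test_backend_names_have_no_v1_suffix(attn_backend, monkeypatch):
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
    # An engine built here would pick up the selected backend; for this sketch
    # we only check that the helper hands back the new, suffix-free names.
    assert not attn_backend.endswith("_VLLM_V1")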
15 changes: 7 additions & 8 deletions tests/v1/attention/test_attention_backends.py
@@ -21,16 +21,15 @@
from vllm.v1.kv_cache_interface import FullAttentionSpec

BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
_Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN,
"FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
]

# Remove flashinfer from the list if it's not available
try:
import flashinfer # noqa: F401
except ImportError:
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)


def _convert_dtype_to_torch(dtype):
@@ -214,7 +213,7 @@ def run_attention_backend(
builder_cls, impl_cls = get_attention_backend(actual_backend)

# Mock flashinfer's get_per_layer_parameters if needed
if actual_backend == _Backend.FLASHINFER_VLLM_V1:
if actual_backend == _Backend.FLASHINFER:
import unittest.mock

from vllm.v1.attention.backends.utils import PerLayerParameters
@@ -434,7 +433,7 @@ def _test_backend_correctness(
# [num_blocks, 2, block_size, num_kv_heads, head_size]
# Select the appropriate KV cache format for each backend
kv_cache_for_backend = kv_cache
if backend_name == _Backend.FLASHINFER_VLLM_V1:
if backend_name == _Backend.FLASHINFER:
kv_cache_for_backend = kv_cache.transpose(0, 1)

# For FlashInfer default to HND layout and
Expand Down Expand Up @@ -518,8 +517,8 @@ def causal_mask_mod(


SLIDING_WINDOW_BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
"FLEX_ATTENTION_SLOW"
]


4 changes: 2 additions & 2 deletions tests/v1/attention/test_mla_backends.py
@@ -15,8 +15,8 @@
from vllm.v1.kv_cache_interface import FullAttentionSpec

BACKENDS_TO_TEST = [
_Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA_VLLM_V1
_Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA
]

# Remove CUTLASS_MLA from the list if not using sm100
12 changes: 6 additions & 6 deletions tests/v1/attention/utils.py
@@ -120,30 +120,30 @@ def get_attention_backend(backend_name: _Backend):
Tuple of (backend_builder_class, backend_impl_class)
"""
backend_map = {
_Backend.FLASH_ATTN_VLLM_V1:
_Backend.FLASH_ATTN:
("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
if current_platform.is_cuda() else
"vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
),
_Backend.FLASHINFER_VLLM_V1:
_Backend.FLASHINFER:
"vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION:
"vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
_Backend.TRITON_ATTN_VLLM_V1:
_Backend.TRITON_ATTN:
"vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.TREE_ATTN:
"vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS_VLLM_V1:
_Backend.XFORMERS:
"vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
_Backend.CUTLASS_MLA:
"vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
_Backend.FLASHMLA_VLLM_V1:
_Backend.FLASHMLA:
"vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA:
"vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
_Backend.FLASHINFER_MLA:
"vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
_Backend.TRITON_MLA_VLLM_V1:
_Backend.TRITON_MLA:
"vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
}

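Hedged usage sketch for the remapped table above (not part of the diff; the import path for _Backend is an assumption, so adjust it to wherever the enum lives in your checkout):

from vllm.platforms.interface import _Backend  # assumed location of the enum

from tests.v1.attention.utils import get_attention_backend

# The helper resolves the enum member to its backend module and returns the
# (builder, implementation) classes, as run_attention_backend() does above.
builder_cls, impl_cls = get_attention_backend(_Backend.TRITON_ATTN)
print(builder_cls.__name__, impl_cls.__name__)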
2 changes: 1 addition & 1 deletion tests/v1/cudagraph/test_cudagraph_mode.py
@@ -89,7 +89,7 @@ class BackendConfig:
# Triton Attention
"TritonAttn":
BackendConfig(name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
}),
7 changes: 5 additions & 2 deletions tests/v1/e2e/test_cascade_attention.py
@@ -9,11 +9,14 @@


@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend",
["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

if attn_backend == "FLASHINFER":
pytest.skip("This test is failing with FlashInfer backend and "
"needs investigation. See issue #25679.")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
7 changes: 3 additions & 4 deletions tests/v1/e2e/test_spec_decode.py
@@ -176,12 +176,11 @@ def test_eagle_correctness(
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")

if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")

method, model_name, spec_model_name, tp_size = model_setup