
Commit 9e88a2e

Remove V0 attention backends
1 parent 0ff8ebb commit 9e88a2e

28 files changed: +138 -7796 lines

examples/offline_inference/qwen_1m.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 
 from vllm import LLM, SamplingParams
 
-os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
 os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 
 
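With DUAL_CHUNK_FLASH_ATTN gone, the example no longer forces an attention backend and leaves the choice to vLLM's automatic selection. For orientation, a minimal sketch of what the remaining setup looks like; the model name, context length, prompt, and sampling settings below are placeholders, not values taken from the real qwen_1m.py:

import os

from vllm import LLM, SamplingParams

# Still needed for very long contexts; the attention backend itself is now
# picked automatically instead of being forced via VLLM_ATTENTION_BACKEND.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

# Placeholder model/settings -- the real example targets a 1M-token Qwen
# checkpoint with a much larger max_model_len.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", max_model_len=32768)
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Summarize the following document: ..."], sampling_params)
print(outputs[0].outputs[0].text)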
tests/compile/test_fusion_attn.py

Lines changed: 2 additions & 1 deletion
@@ -335,7 +335,8 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("model_name, model_class", MODELS)
 @pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
-                         current_platform.is_cuda() else [_Backend.ROCM_FLASH])
+                         current_platform.is_cuda()
+                         else [_Backend.TRITON_ATTN_VLLM_V1])
 @pytest.mark.parametrize(
     "split_attention",
     [False, True] if current_platform.is_rocm() else [False])
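ROCM_FLASH was presumably one of the V0 backends this commit deletes, so the non-CUDA branch of the parametrization now targets the Triton V1 backend instead. A minimal sketch of the same platform-conditional selection, pulled out into a reusable helper; the helper name is hypothetical and the _Backend import path is an assumption (the test module's own import is not shown in this hunk):

import pytest

from vllm.platforms import current_platform
from vllm.platforms.interface import _Backend  # assumed import path


def backends_for_platform() -> list:
    # Hypothetical helper mirroring the parametrization above: FlashInfer on
    # CUDA, otherwise the Triton V1 attention backend.
    if current_platform.is_cuda():
        return [_Backend.FLASHINFER]
    return [_Backend.TRITON_ATTN_VLLM_V1]


@pytest.mark.parametrize("backend", backends_for_platform())
def test_backend_choice(backend):
    assert backend in (_Backend.FLASHINFER, _Backend.TRITON_ATTN_VLLM_V1)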

tests/kernels/attention/test_attention.py

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 
-from vllm.attention.backends.xformers import _make_alibi_bias
+from tests.kernels.utils import make_alibi_bias
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.

@@ -429,8 +429,8 @@ def test_multi_query_kv_attention(
     alibi_bias = None
     if use_alibi:
         alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
-        attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
-                                     seq_lens)
+        attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
+                                    seq_lens)
     output = torch.empty_like(query)
     start = 0
     # Dynamic sequence length not supported with custom attn_bias.
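make_alibi_bias now comes from tests/kernels/utils.py rather than the deleted V0 xformers backend. As a reminder of what such a helper computes, here is a minimal sketch of ALiBi bias construction; it uses a simplified, hypothetical signature (it ignores the num_kv_heads argument the real helper takes) and returns plain dense tensors rather than whatever structure the test utility actually produces:

import torch


def make_alibi_bias_sketch(alibi_slopes: torch.Tensor, dtype: torch.dtype,
                           seq_lens: list[int]) -> list[torch.Tensor]:
    # Illustrative only: one dense (num_heads, seq_len, seq_len) bias per
    # sequence. ALiBi adds a per-head linear penalty based on the distance
    # between the key position j and the query position i.
    biases = []
    for seq_len in seq_lens:
        positions = torch.arange(seq_len, dtype=dtype)
        # rel[i, j] = j - i; the causal mask is expected to hide j > i.
        rel = positions[None, :] - positions[:, None]
        # Scale by per-head slopes -> (num_heads, seq_len, seq_len).
        bias = alibi_slopes[:, None, None].to(dtype) * rel[None, :, :]
        biases.append(bias)
    return biases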

tests/kernels/attention/test_attention_selector.py

Lines changed: 0 additions & 340 deletions
This file was deleted.
