
Commit 9e88a2e

Remove V0 attention backends
1 parent 0ff8ebb commit 9e88a2e

28 files changed: +138 -7796 lines

examples/offline_inference/qwen_1m.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 
 from vllm import LLM, SamplingParams
 
-os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
 os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 
 
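With DUAL_CHUNK_FLASH_ATTN gone, the example no longer forces an attention backend and leaves the choice to vLLM's automatic selection. For orientation, a minimal sketch of what the remaining setup looks like; the model name, context length, prompt, and sampling settings below are placeholders, not values taken from the real qwen_1m.py:

import os

from vllm import LLM, SamplingParams

# Still needed for very long contexts; the attention backend itself is now
# picked automatically instead of being forced via VLLM_ATTENTION_BACKEND.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

# Placeholder model/settings -- the real example targets a 1M-token Qwen
# checkpoint with a much larger max_model_len.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", max_model_len=32768)
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Summarize the following document: ..."], sampling_params)
print(outputs[0].outputs[0].text)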
tests/compile/test_fusion_attn.py

Lines changed: 2 additions & 1 deletion
@@ -335,7 +335,8 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("model_name, model_class", MODELS)
 @pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
-                         current_platform.is_cuda() else [_Backend.ROCM_FLASH])
+                         current_platform.is_cuda()
+                         else [_Backend.TRITON_ATTN_VLLM_V1])
 @pytest.mark.parametrize(
     "split_attention",
     [False, True] if current_platform.is_rocm() else [False])
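ROCM_FLASH was presumably one of the V0 backends this commit deletes, so the non-CUDA branch of the parametrization now targets the Triton V1 backend instead. A minimal sketch of the same platform-conditional selection, pulled out into a reusable helper; the helper name is hypothetical and the _Backend import path is an assumption (the test module's own import is not shown in this hunk):

import pytest

from vllm.platforms import current_platform
from vllm.platforms.interface import _Backend  # assumed import path


def backends_for_platform() -> list:
    # Hypothetical helper mirroring the parametrization above: FlashInfer on
    # CUDA, otherwise the Triton V1 attention backend.
    if current_platform.is_cuda():
        return [_Backend.FLASHINFER]
    return [_Backend.TRITON_ATTN_VLLM_V1]


@pytest.mark.parametrize("backend", backends_for_platform())
def test_backend_choice(backend):
    assert backend in (_Backend.FLASHINFER, _Backend.TRITON_ATTN_VLLM_V1)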

tests/kernels/attention/test_attention.py

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 
-from vllm.attention.backends.xformers import _make_alibi_bias
+from tests.kernels.utils import make_alibi_bias
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.

@@ -429,8 +429,8 @@ def test_multi_query_kv_attention(
     alibi_bias = None
     if use_alibi:
         alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
-        attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
-                                     seq_lens)
+        attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
+                                    seq_lens)
     output = torch.empty_like(query)
     start = 0
     # Dynamic sequence length not supported with custom attn_bias.
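make_alibi_bias now comes from tests/kernels/utils.py rather than the deleted V0 xformers backend. As a reminder of what such a helper computes, here is a minimal sketch of ALiBi bias construction; it uses a simplified, hypothetical signature (it ignores the num_kv_heads argument the real helper takes) and returns plain dense tensors rather than whatever structure the test utility actually produces:

import torch


def make_alibi_bias_sketch(alibi_slopes: torch.Tensor, dtype: torch.dtype,
                           seq_lens: list[int]) -> list[torch.Tensor]:
    # Illustrative only: one dense (num_heads, seq_len, seq_len) bias per
    # sequence. ALiBi adds a per-head linear penalty based on the distance
    # between the key position j and the query position i.
    biases = []
    for seq_len in seq_lens:
        positions = torch.arange(seq_len, dtype=dtype)
        # rel[i, j] = j - i; the causal mask is expected to hide j > i.
        rel = positions[None, :] - positions[:, None]
        # Scale by per-head slopes -> (num_heads, seq_len, seq_len).
        bias = alibi_slopes[:, None, None].to(dtype) * rel[None, :, :]
        biases.append(bias)
    return biases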

tests/kernels/attention/test_attention_selector.py

Lines changed: 0 additions & 340 deletions
This file was deleted.
