Skip to content

Commit 6317a51

Browse files
authored
Categorize tests/kernels/ based on kernel type (#16799)
Signed-off-by: mgoin <mgoin64@gmail.com>
1 parent aa72d9a commit 6317a51

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

55 files changed (+79 / −48 lines changed)

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import pytest
1717
import yaml
1818

19-
RTOL = 0.05
19+
RTOL = 0.08
2020
TEST_DATA_FILE = os.environ.get(
2121
"LM_EVAL_TEST_DATA_FILE",
2222
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

.buildkite/test-pipeline.yaml

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -317,15 +317,46 @@ steps:
317317
commands:
318318
- pytest -v -s compile/test_full_graph.py
319319

320-
- label: Kernels Test %N # 1h each
321-
mirror_hardwares: [amd]
320+
- label: Kernels Core Operation Test
322321
source_file_dependencies:
323322
- csrc/
323+
- tests/kernels/core
324+
commands:
325+
- pytest -v -s kernels/core
326+
327+
- label: Kernels Attention Test %N
328+
source_file_dependencies:
329+
- csrc/attention/
324330
- vllm/attention
325-
- tests/kernels
331+
- vllm/v1/attention
332+
- tests/kernels/attention
326333
commands:
327-
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
328-
parallelism: 4
334+
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
335+
parallelism: 2
336+
337+
- label: Kernels Quantization Test %N
338+
source_file_dependencies:
339+
- csrc/quantization/
340+
- vllm/model_executor/layers/quantization
341+
- tests/kernels/quantization
342+
commands:
343+
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
344+
parallelism: 2
345+
346+
- label: Kernels MoE Test
347+
source_file_dependencies:
348+
- csrc/moe/
349+
- tests/kernels/moe
350+
- vllm/model_executor/layers/fused_moe/
351+
commands:
352+
- pytest -v -s kernels/moe
353+
354+
- label: Kernels Mamba Test
355+
source_file_dependencies:
356+
- csrc/mamba/
357+
- tests/kernels/mamba
358+
commands:
359+
- pytest -v -s kernels/mamba
329360

330361
- label: Tensorizer Test # 11min
331362
# mirror_hardwares: [amd]
File renamed without changes.

tests/kernels/test_attention.py renamed to tests/kernels/attention/test_attention.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,12 @@
66
import pytest
77
import torch
88

9+
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
910
from tests.kernels.utils import opcheck
1011
from vllm import _custom_ops as ops
1112
from vllm.platforms import current_platform
1213
from vllm.utils import get_max_shared_memory_bytes
1314

14-
from .allclose_default import get_default_atol, get_default_rtol
15-
1615
if not current_platform.is_rocm():
1716
from xformers import ops as xops
1817
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

tests/kernels/test_attention_selector.py renamed to tests/kernels/attention/test_attention_selector.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ def test_env(
156156
expected = ("TRITON_MLA_VLLM_V1"
157157
if use_v1 else "TRITON_MLA")
158158
assert backend.get_name() == expected
159+
elif name == "FLASHINFER":
160+
backend = get_attn_backend(16,
161+
torch.float16,
162+
torch.float16,
163+
block_size,
164+
False,
165+
use_mla=use_mla)
166+
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
167+
assert backend.get_name() == expected
159168
else:
160169
backend = get_attn_backend(16,
161170
torch.float16,

tests/kernels/test_blocksparse_attention.py renamed to tests/kernels/attention/test_blocksparse_attention.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@
66
import pytest
77
import torch
88

9+
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
910
from vllm import _custom_ops as ops
1011
from vllm.attention.ops.blocksparse_attention.interface import (
1112
LocalStridedBlockSparseAttn)
1213
from vllm.platforms import current_platform
1314
from vllm.utils import get_max_shared_memory_bytes
1415

15-
from .allclose_default import get_default_atol, get_default_rtol
16-
1716
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
1817
# This will change depending on the compute capability.
1918
# - 512 as a buffer
File renamed without changes.
File renamed without changes.

0 commit comments

Comments (0)