@@ -6,9 +6,8 @@
 import pytest
 import torch
 
-from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
-                                                    FLOAT8_E4M3_MAX,
-                                                    dequantize_nvfp4_to_dtype)
+from tests.kernels.quantization.nvfp4_utils import (dequantize_nvfp4_to_dtype,
+                                                    get_nvfp4_global_scale)
 from vllm.platforms import current_platform
 from vllm.utils import round_up
 
@@ -47,6 +46,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 BLOCK_SIZE = [16]
 WINDOW_LEFT = [-1, 127]
 SOFT_CAP = [None, 50.0]
+HAS_SINKS = [True, False]
 
 NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 
@@ -61,6 +61,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
 @pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", SOFT_CAP)
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
 @torch.inference_mode
 def test_flashinfer_trtllm_decode_with_baseline(
     dtype: torch.dtype,
@@ -74,9 +75,10 @@ def test_flashinfer_trtllm_decode_with_baseline(
     block_size: int,
     window_left: int,
     soft_cap: Optional[float],
+    has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    current_platform.seed_everything(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
@@ -98,7 +100,17 @@ def test_flashinfer_trtllm_decode_with_baseline(
     else:
         raise ValueError(f"Invalid kv_layout: {kv_layout}")
 
-    query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
+    # max_q_len = 1
+    q_lens = torch.ones((batch_size, ), dtype=torch.int32)
+    q_indptr = torch.cat([
+        torch.tensor([0], dtype=torch.int32),
+        torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+    ])
+
+    query = torch.randn(torch.sum(q_lens).item(),
+                        num_qo_heads,
+                        head_size,
+                        dtype=dtype)
     if q_quant_dtype == FP8_DTYPE:
         query, q_scale = to_float8(query)
         ref_query = query.to(dtype) * q_scale
@@ -109,7 +121,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
     kv_lens = torch.randint(1, max_kv_len, (batch_size, ), dtype=torch.int32)
     kv_lens[-1] = max_kv_len
 
-    seq_lens = kv_lens
+    seq_lens = kv_lens + q_lens
     max_seq_len = torch.max(seq_lens).item()
 
     kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
@@ -146,31 +158,42 @@ def test_flashinfer_trtllm_decode_with_baseline(
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Decode
-    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
-        workspace_buffer, kv_layout, use_tensor_cores=True)
-    wrapper.plan(kv_indptr,
-                 kv_indices,
-                 kv_last_page_lens,
-                 num_qo_heads,
-                 num_kv_heads,
-                 head_size,
-                 block_size,
-                 "NONE",
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer,
+            kv_layout=kv_layout,
+            backend="fa2")
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer,
+            kv_layout=kv_layout,
+            backend="fa2")
+
+    wrapper.plan(qo_indptr=q_indptr,
+                 paged_kv_indptr=kv_indptr,
+                 paged_kv_indices=kv_indices,
+                 paged_kv_last_page_len=kv_last_page_lens,
+                 num_qo_heads=num_qo_heads,
+                 num_kv_heads=num_kv_heads,
+                 head_dim_qk=head_size,
+                 page_size=block_size,
+                 causal=True,
                  sm_scale=sm_scale,
-                 q_data_type=dtype,
-                 kv_data_type=dtype,
                  window_left=window_left,
-                 logits_soft_cap=soft_cap)
-
+                 logits_soft_cap=soft_cap,
+                 q_data_type=dtype,
+                 kv_data_type=dtype)
     output = torch.empty(ref_query.shape, dtype=dtype)
-    wrapper.run(ref_query, ref_kv_cache, out=output)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
     o_scale = 1.0
     o_sf_scale = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(output.flatten(), dim=-1)).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
 
     # TRTLLM Decode
     if o_quant_dtype == FP4_DTYPE:
@@ -194,6 +217,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
         bmm1_scale=q_scale * k_scale * sm_scale,
         bmm2_scale=v_scale / o_scale,
         window_left=window_left,
+        sinks=sinks,
         o_sf_scale=o_sf_scale,
         out=output_trtllm,
     )
@@ -210,11 +234,13 @@ def test_flashinfer_trtllm_decode_with_baseline(
                                                   query.shape[2])
 
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
-        rtol, atol = 3e-1, 1e0
+        rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 5e-2, 7e-2
-    else:
+        rtol, atol = 2e-2, 4e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
         rtol, atol = 1e-2, 2e-2
+    else:
+        rtol, atol = 1e-2, 1e-2
 
     torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), \
         f"{torch.max(torch.abs(output - output_trtllm))}"
@@ -230,6 +256,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
 @pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
 @torch.inference_mode
 def test_flashinfer_trtllm_prefill_with_baseline(
     dtype: torch.dtype,
@@ -243,9 +270,10 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     block_size: int,
     window_left: int,
     soft_cap: Optional[float],
+    has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
-    current_platform.seed_everything(0)
+    current_platform.seed_everything(42)
 
     q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
     q_quant_dtype = q_quant_dtype or dtype
@@ -288,7 +316,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         q_scale = 1.0
         ref_query = query
 
-    kv_lens = torch.randint(0, max_kv_len, (batch_size, ), dtype=torch.int32)
+    kv_lens = torch.randint(1, max_kv_len, (batch_size, ), dtype=torch.int32)
     kv_lens[-1] = max_kv_len
 
     seq_lens = kv_lens + q_lens
@@ -328,32 +356,42 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
 
     # Baseline Prefill
-    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
-        workspace_buffer, kv_layout)
-    wrapper.plan(q_indptr,
-                 kv_indptr,
-                 kv_indices,
-                 kv_last_page_lens,
-                 num_qo_heads,
-                 num_kv_heads,
-                 head_size,
-                 block_size,
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer,
+            kv_layout=kv_layout,
+            backend="fa2")
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer,
+            kv_layout=kv_layout,
+            backend="fa2")
+
+    wrapper.plan(qo_indptr=q_indptr,
+                 paged_kv_indptr=kv_indptr,
+                 paged_kv_indices=kv_indices,
+                 paged_kv_last_page_len=kv_last_page_lens,
+                 num_qo_heads=num_qo_heads,
+                 num_kv_heads=num_kv_heads,
+                 head_dim_qk=head_size,
+                 page_size=block_size,
                  causal=True,
                  sm_scale=sm_scale,
-                 q_data_type=dtype,
-                 kv_data_type=dtype,
                  window_left=window_left,
-                 logits_soft_cap=soft_cap)
-
+                 logits_soft_cap=soft_cap,
+                 q_data_type=dtype,
+                 kv_data_type=dtype)
     output = torch.empty(ref_query.shape, dtype=dtype)
-    wrapper.run(ref_query, ref_kv_cache, out=output)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
     o_scale = 1.0
     o_sf_scale = None
     if o_quant_dtype == FP8_DTYPE:
         _, o_scale = to_float8(output)
     elif o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
-                      torch.amax(output.flatten(), dim=-1)).to(torch.float32)
+        o_sf_scale = get_nvfp4_global_scale(output)
 
     # TRTLLM Prefill
     if o_quant_dtype == FP4_DTYPE:
@@ -381,6 +419,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         cum_seq_lens_q=q_indptr,
         cum_seq_lens_kv=kv_indptr,
         window_left=window_left,
+        sinks=sinks,
         o_sf_scale=o_sf_scale,
         out=output_trtllm,
     )
@@ -397,11 +436,11 @@ def test_flashinfer_trtllm_prefill_with_baseline(
                                                   query.shape[2])
 
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
-        rtol, atol = 4e-1, 1e0
+        rtol, atol = 1e-1, 2e-1
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 5e-2, 7e-2
-    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
         rtol, atol = 4e-2, 6e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 2e-2, 3e-2
     else:
         rtol, atol = 1e-2, 1e-2
 
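For context on what the new `has_sinks` path exercises: an attention sink is a learned per-head logit that joins the softmax normalization without contributing a value, which damps attention to weakly matching tokens. Below is a minimal single-query sketch of that reference math; `attention_with_sinks` is a hypothetical helper for illustration, not part of this diff or of the flashinfer API.

import torch

def attention_with_sinks(q, k, v, sinks, sm_scale):
    """Single-query attention with per-head sink logits (sketch).

    q: [heads, dim], k/v: [heads, seq, dim], sinks: [heads]
    """
    logits = torch.einsum("hd,hsd->hs", q, k) * sm_scale
    # The sink enters the softmax denominator as one extra logit per head...
    logits = torch.cat([logits, sinks.unsqueeze(-1)], dim=-1)
    # ...but its probability mass maps to no value, so it is dropped here.
    probs = torch.softmax(logits, dim=-1)[..., :-1]
    return torch.einsum("hs,hsd->hd", probs, v)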