
Commit 58f66f8

Add attention sink support to tensor core template for decode attention
Add maybe_s_aux support to the prefill template used for decode attention when use_tensor_cores=True. Includes updates to the params structures, variant handling, JIT generation, and Python wrappers, plus test coverage validated against a reference implementation.
1 parent 99067e4 commit 58f66f8

File tree

6 files changed: +116 −5 lines

flashinfer/decode.py
flashinfer/jit/attention/modules.py
flashinfer/prefill.py
include/flashinfer/attention/default_prefill_params.cuh
include/flashinfer/attention/variants.cuh
tests/attention/test_decode_sink_attention.py

flashinfer/decode.py
Lines changed: 3 additions & 0 deletions

@@ -388,6 +388,7 @@ def single_decode_with_kv_cache(
     rope_scale: Optional[float] = None,
     rope_theta: Optional[float] = None,
     return_lse: Literal[True] = True,
+    sinks: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...


@@ -407,6 +408,7 @@ def single_decode_with_kv_cache(
     rope_scale: Optional[float] = None,
     rope_theta: Optional[float] = None,
     return_lse: bool = False,
+    sinks: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     r"""Decode attention with KV Cache for single request, return attention output.
@@ -533,6 +535,7 @@ def single_decode_with_kv_cache(
         window_left,
         None,  # packed_custom_mask
         _get_cache_alibi_slopes_buf(num_qo_heads, q.device),
+        sinks,  # maybe_s_aux
         logits_soft_cap,
         sm_scale,
         None,  # scale_q, not supported yet
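The new sinks argument surfaces in the public single_decode_with_kv_cache API above. A minimal usage sketch, mirroring the shapes, dtypes, and keyword arguments of the test added in this commit (an illustration, not taken verbatim from the library docs):

import math
import torch
import flashinfer

num_qo_heads, num_kv_heads, head_dim, kv_len = 32, 8, 128, 512
device, dtype = torch.device("cuda:0"), torch.bfloat16

q = torch.randn(num_qo_heads, head_dim, dtype=dtype, device=device)
k = torch.randn(kv_len, num_kv_heads, head_dim, dtype=dtype, device=device)  # NHD layout
v = torch.randn(kv_len, num_kv_heads, head_dim, dtype=dtype, device=device)

# One sink logit per query head, kept in float32.
sinks = torch.randn(num_qo_heads, dtype=torch.float32, device=device) * 0.5

out = flashinfer.single_decode_with_kv_cache(
    q, k, v,
    kv_layout="NHD",
    pos_encoding_mode="NONE",
    use_tensor_cores=True,  # routes decode through the prefill (tensor core) template
    sm_scale=1.0 / math.sqrt(head_dim),
    sinks=sinks,            # new optional argument
)
assert out.shape == (num_qo_heads, head_dim)

With use_tensor_cores=True the decode call is dispatched through the prefill (fa2) template, which is why the sink plumbing in the rest of this commit targets the prefill params and JIT path.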

flashinfer/jit/attention/modules.py
Lines changed: 4 additions & 4 deletions

@@ -467,8 +467,8 @@ def gen_single_decode_module(
         dtype_o,
         head_dim_qk,
         head_dim_vo,
-        ["maybe_alibi_slopes", "maybe_s_aux"],  # additional_tensor_names
-        ["float", "float"],  # additional_tensor_dtypes
+        ["maybe_alibi_slopes"],  # additional_tensor_names
+        ["float"],  # additional_tensor_dtypes
         [
             "logits_soft_cap",
             "sm_scale",
@@ -516,8 +516,8 @@ def gen_single_prefill_module(

     if backend == "fa2":
         assert not fp8_enabled, "fp8 tensor core is not supported in fa2 backend"
-        additional_tensor_names = ["maybe_custom_mask", "maybe_alibi_slopes"]
-        additional_tensor_dtypes = ["uint8_t", "float"]
+        additional_tensor_names = ["maybe_custom_mask", "maybe_alibi_slopes", "maybe_s_aux"]
+        additional_tensor_dtypes = ["uint8_t", "float", "float"]
         additional_scalar_names = [
             "logits_soft_cap",
             "sm_scale",

flashinfer/prefill.py
Lines changed: 3 additions & 0 deletions

@@ -277,6 +277,7 @@ def run_single_prefill(
     window_left: int,
     maybe_packed_custom_mask: Optional[torch.Tensor],
     maybe_alibi_slopes: Optional[torch.Tensor],
+    maybe_s_aux: Optional[torch.Tensor],
     logits_soft_cap: float,
     sm_scale: float,
     scale_q: Optional[torch.Tensor],
@@ -330,6 +331,7 @@ def run_single_prefill(
         window_left,
         maybe_packed_custom_mask,
         maybe_alibi_slopes,
+        maybe_s_aux,
         logits_soft_cap,
         sm_scale,
         1.0 / rope_scale,  # rope_rcp_scale
@@ -350,6 +352,7 @@ def _fake_run_single_prefill(
     window_left: int,
     maybe_packed_custom_mask: Optional[torch.Tensor],
     maybe_alibi_slopes: Optional[torch.Tensor],
+    maybe_s_aux: Optional[torch.Tensor],
     logits_soft_cap: float,
     sm_scale: float,
     rope_scale: float,

include/flashinfer/attention/default_prefill_params.cuh
Lines changed: 12 additions & 0 deletions

@@ -38,6 +38,7 @@ struct SinglePrefillParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t qo_len;
   uint32_t kv_len;
@@ -66,6 +67,7 @@ struct SinglePrefillParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         qo_len(0),
         kv_len(0),
@@ -87,6 +89,7 @@ struct SinglePrefillParams {

   __host__ SinglePrefillParams(DTypeQ* q, DTypeKV* k, DTypeKV* v, uint8_t* maybe_custom_mask,
                                DTypeO* o, float* lse, float* maybe_alibi_slopes,
+                               float* maybe_s_aux,
                                uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t qo_len,
                                uint32_t kv_len, uint32_t q_stride_n, uint32_t q_stride_h,
                                uint32_t kv_stride_n, uint32_t kv_stride_h, uint32_t head_dim,
@@ -99,6 +102,7 @@ struct SinglePrefillParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / num_kv_heads),
         num_qo_heads(num_qo_heads),
         num_kv_heads(num_kv_heads),
@@ -146,6 +150,7 @@ struct BatchPrefillRaggedParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t num_qo_heads;
   uint32_t num_kv_heads;
@@ -190,6 +195,7 @@ struct BatchPrefillRaggedParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         num_qo_heads(0),
         num_kv_heads(0),
@@ -224,6 +230,7 @@ struct BatchPrefillRaggedParams {
                                     IdType* q_indptr, IdType* kv_indptr, IdType* maybe_mask_indptr,
                                     IdType* maybe_q_rope_offset, IdType* maybe_k_rope_offset,
                                     DTypeO* o, float* lse, float* maybe_alibi_slopes,
+                                    float* maybe_s_aux,
                                     uint32_t num_qo_heads, uint32_t num_kv_heads,
                                     uint32_t q_stride_n, uint32_t q_stride_h, uint32_t kv_stride_n,
                                     uint32_t kv_stride_h, int32_t window_left,
@@ -241,6 +248,7 @@ struct BatchPrefillRaggedParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / num_kv_heads),
         num_qo_heads(num_qo_heads),
         num_kv_heads(num_kv_heads),
@@ -296,6 +304,7 @@ struct BatchPrefillPagedParams {
   DTypeO* o;
   float* lse;
   float* maybe_alibi_slopes;
+  float* maybe_s_aux;
   uint_fastdiv group_size;
   uint32_t num_qo_heads;
   IdType q_stride_n;
@@ -332,6 +341,7 @@ struct BatchPrefillPagedParams {
         o(nullptr),
         lse(nullptr),
         maybe_alibi_slopes(nullptr),
+        maybe_s_aux(nullptr),
         group_size(),
         num_qo_heads(0),
         q_stride_n(0),
@@ -361,6 +371,7 @@ struct BatchPrefillPagedParams {
                                    uint8_t* maybe_custom_mask, IdType* q_indptr,
                                    IdType* maybe_mask_indptr, IdType* maybe_q_rope_offset,
                                    DTypeO* o, float* lse, float* maybe_alibi_slopes,
+                                   float* maybe_s_aux,
                                    uint32_t num_qo_heads, IdType q_stride_n, IdType q_stride_h,
                                    int32_t window_left, float logits_soft_cap, float sm_scale,
                                    float rope_scale, float rope_theta)
@@ -373,6 +384,7 @@ struct BatchPrefillPagedParams {
         o(o),
         lse(lse),
         maybe_alibi_slopes(maybe_alibi_slopes),
+        maybe_s_aux(maybe_s_aux),
         group_size(num_qo_heads / paged_kv.num_heads),
         num_qo_heads(num_qo_heads),
         q_stride_n(q_stride_n),

include/flashinfer/attention/variants.cuh
Lines changed: 10 additions & 0 deletions

@@ -90,6 +90,16 @@ struct DefaultAttention : AttentionVariantBase {
     }
     return mask;
   })
+
+  REGISTER_M_D_UPDATE(params, kv_tile_idx, qo_head_idx, m, d, scale, {
+    if constexpr (use_softmax) {
+      if (params.maybe_s_aux != nullptr) {
+        constexpr float LOG2_E = 1.4426950408889634f;  // log2(e)
+        float s_aux_val = params.maybe_s_aux[qo_head_idx];
+        d += math::ptx_exp2((s_aux_val - m) * LOG2_E);
+      }
+    }
+  })
 };

 };  // namespace flashinfer
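The REGISTER_M_D_UPDATE hook above folds the per-head sink logit into the softmax denominator only: ptx_exp2((s_aux_val - m) * LOG2_E) equals exp(s_aux_val - m), added to the running sum d computed against the running max m. In plain PyTorch, the effect for a single query and a single head is roughly the following (a sketch of the math, not the kernel code; names are illustrative):

import torch

def sink_softmax_decode_sketch(q, k, v, s_aux, sm_scale):
    # q: [head_dim], k/v: [kv_len, head_dim], s_aux: scalar sink logit for this head
    logits = (k @ q) * sm_scale          # [kv_len] attention scores
    m = logits.max()                     # running max from the online softmax
    p = torch.exp(logits - m)            # unnormalized probabilities
    d = p.sum() + torch.exp(s_aux - m)   # the hook adds exp(s_aux - m) to d
    return (p / d) @ v                   # the sink has no value row; it only
                                         # down-weights the real KV entries

Because the sink joins the normalization but carries no value row, the attention weights over the real KV entries sum to less than one, letting the model shift probability mass onto the sink instead of the actual tokens.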

tests/attention/test_decode_sink_attention.py
Lines changed: 84 additions & 1 deletion

@@ -257,7 +257,7 @@ def test_batch_decode_without_sink_attention(

 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("kv_len", [64])
-@pytest.mark.parametrize("num_qo_heads", [8])
+@pytest.mark.parametrize("num_qo_heads", [16])
 @pytest.mark.parametrize("num_kv_heads", [8])
 @pytest.mark.parametrize("head_dim", [64])
 def test_batch_decode_sink_attention_gqa(
@@ -321,5 +321,88 @@ def test_batch_decode_sink_attention_gqa(
     assert not torch.isinf(out).any()


+@pytest.mark.parametrize("kv_len", [32, 128, 512])
+@pytest.mark.parametrize(
+    "num_qo_heads,num_kv_heads",
+    [
+        (8, 8),  # MHA: equal heads
+        (16, 8),  # GQA: 2:1 ratio
+        (32, 8),  # GQA: 4:1 ratio
+        (32, 32),  # MHA: equal heads
+    ],
+)
+@pytest.mark.parametrize("head_dim", [64, 128])
+@pytest.mark.parametrize("kv_layout", ["NHD", "HND"])
+def test_single_decode_sink_attention_tensor_cores(
+    kv_len, num_qo_heads, num_kv_heads, head_dim, kv_layout
+):
+    """Test sink attention with single decode using tensor cores (prefill template)."""
+    torch.manual_seed(42)
+    device = torch.device("cuda:0")
+    dtype = torch.bfloat16
+
+    sm_scale = 1.0 / math.sqrt(head_dim)
+    window_left = -1  # No sliding window
+
+    # Create query tensor
+    q = torch.randn(num_qo_heads, head_dim, dtype=dtype, device=device)
+
+    # Create KV cache based on layout
+    if kv_layout == "NHD":
+        k = torch.randn(kv_len, num_kv_heads, head_dim, dtype=dtype, device=device)
+        v = torch.randn(kv_len, num_kv_heads, head_dim, dtype=dtype, device=device)
+    else:  # HND
+        k = torch.randn(num_kv_heads, kv_len, head_dim, dtype=dtype, device=device)
+        v = torch.randn(num_kv_heads, kv_len, head_dim, dtype=dtype, device=device)
+
+    # Sink tensor should have num_qo_heads elements
+    # Sink values should be on a similar scale to the logits (QK^T * sm_scale)
+    sinks = torch.randn(num_qo_heads, device=device, dtype=torch.float32) * 0.5
+
+    # Test with tensor cores enabled (uses prefill template)
+    out = flashinfer.single_decode_with_kv_cache(
+        q,
+        k,
+        v,
+        kv_layout=kv_layout,
+        pos_encoding_mode="NONE",
+        use_tensor_cores=True,
+        sm_scale=sm_scale,
+        sinks=sinks,
+    )
+
+    # Basic sanity check: output should have correct shape
+    assert out.shape == (num_qo_heads, head_dim)
+    assert out.dtype == dtype
+    assert not torch.isnan(out).any()
+    assert not torch.isinf(out).any()
+
+    # Validate against reference implementation
+    # Convert to batch format for reference (add batch dimension)
+    q_batch = q.unsqueeze(0)  # [1, num_qo_heads, head_dim]
+
+    # Convert KV cache to reference format [batch_size, kv_len, num_kv_heads, head_dim]
+    if kv_layout == "NHD":
+        k_cache_ref = k.unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
+        v_cache_ref = v.unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
+    else:  # HND -> transpose to NHD
+        k_cache_ref = k.transpose(0, 1).unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
+        v_cache_ref = v.transpose(0, 1).unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
+
+    # Compute reference output
+    out_ref = sink_attention_decode_ref(
+        q_batch, k_cache_ref, v_cache_ref, sinks, window_left, sm_scale
+    )
+
+    # Remove batch dimension from reference output
+    out_ref = out_ref.squeeze(0)  # [num_qo_heads, head_dim]

+    # Compare results
+    # bfloat16 may have slightly larger numerical differences due to lower precision,
+    # differences in order of operations between the reference and the CUDA kernel, and
+    # GQA scenarios where multiple query heads share KV heads
+    torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=3.5e-2)
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
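The helper sink_attention_decode_ref used for validation is defined earlier in this test file and is not part of this diff. As a rough stand-in only, assuming it expands KV heads for GQA and folds the sinks into the softmax denominator the same way the kernel hook does, it could look like the sketch below (hypothetical; the real helper may differ, and window_left is ignored because the test passes -1, i.e. no sliding window):

import torch

def sink_attention_decode_ref_sketch(q, k, v, sinks, window_left, sm_scale):
    # q: [batch, num_qo_heads, head_dim]
    # k, v: [batch, kv_len, num_kv_heads, head_dim]; sinks: [num_qo_heads] float32
    batch, num_qo_heads, head_dim = q.shape
    num_kv_heads = k.shape[2]
    group = num_qo_heads // num_kv_heads
    # Expand KV heads so every query head sees its shared KV head (GQA -> MHA view)
    k = k.repeat_interleave(group, dim=2).float()  # [batch, kv_len, num_qo_heads, head_dim]
    v = v.repeat_interleave(group, dim=2).float()
    logits = torch.einsum("bhd,bnhd->bhn", q.float(), k) * sm_scale  # [batch, heads, kv_len]
    m = logits.amax(dim=-1, keepdim=True)
    p = torch.exp(logits - m)
    d = p.sum(dim=-1, keepdim=True) + torch.exp(sinks.view(1, -1, 1) - m)
    return torch.einsum("bhn,bnhd->bhd", p / d, v).to(q.dtype)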
