
Commit 62859a9

Add maybe_s_aux to single decode module additional_tensor_names
Update the single decode module's additional_tensor_names to include 'maybe_s_aux', along with the corresponding 'float' entry in additional_tensor_dtypes. This matches the batch decode definition and enables sink attention support for single decode operations.
1 parent 58f66f8 commit 62859a9

File tree

3 files changed: +20 −15 lines changed

flashinfer/jit/attention/modules.py

Lines changed: 7 additions & 3 deletions

@@ -467,8 +467,8 @@ def gen_single_decode_module(
         dtype_o,
         head_dim_qk,
         head_dim_vo,
-        ["maybe_alibi_slopes"],  # additional_tensor_names
-        ["float"],  # additional_tensor_dtypes
+        ["maybe_alibi_slopes", "maybe_s_aux"],  # additional_tensor_names
+        ["float", "float"],  # additional_tensor_dtypes
         [
             "logits_soft_cap",
             "sm_scale",
@@ -516,7 +516,11 @@ def gen_single_prefill_module(
 
     if backend == "fa2":
         assert not fp8_enabled, "fp8 tensor core is not supported in fa2 backend"
-        additional_tensor_names = ["maybe_custom_mask", "maybe_alibi_slopes", "maybe_s_aux"]
+        additional_tensor_names = [
+            "maybe_custom_mask",
+            "maybe_alibi_slopes",
+            "maybe_s_aux",
+        ]
         additional_tensor_dtypes = ["uint8_t", "float", "float"]
         additional_scalar_names = [
             "logits_soft_cap",

include/flashinfer/attention/default_prefill_params.cuh

Lines changed: 7 additions & 10 deletions

@@ -88,8 +88,7 @@ struct SinglePrefillParams {
         partition_kv(false) {}
 
   __host__ SinglePrefillParams(DTypeQ* q, DTypeKV* k, DTypeKV* v, uint8_t* maybe_custom_mask,
-                               DTypeO* o, float* lse, float* maybe_alibi_slopes,
-                               float* maybe_s_aux,
+                               DTypeO* o, float* lse, float* maybe_alibi_slopes, float* maybe_s_aux,
                                uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t qo_len,
                                uint32_t kv_len, uint32_t q_stride_n, uint32_t q_stride_h,
                                uint32_t kv_stride_n, uint32_t kv_stride_h, uint32_t head_dim,
@@ -230,10 +229,9 @@ struct BatchPrefillRaggedParams {
                                     IdType* q_indptr, IdType* kv_indptr, IdType* maybe_mask_indptr,
                                     IdType* maybe_q_rope_offset, IdType* maybe_k_rope_offset,
                                     DTypeO* o, float* lse, float* maybe_alibi_slopes,
-                                    float* maybe_s_aux,
-                                    uint32_t num_qo_heads, uint32_t num_kv_heads,
-                                    uint32_t q_stride_n, uint32_t q_stride_h, uint32_t kv_stride_n,
-                                    uint32_t kv_stride_h, int32_t window_left,
+                                    float* maybe_s_aux, uint32_t num_qo_heads,
+                                    uint32_t num_kv_heads, uint32_t q_stride_n, uint32_t q_stride_h,
+                                    uint32_t kv_stride_n, uint32_t kv_stride_h, int32_t window_left,
                                     float logits_soft_cap, float sm_scale, float rope_scale,
                                     float rope_theta)
       : q(q),
@@ -371,10 +369,9 @@ struct BatchPrefillPagedParams {
                                    uint8_t* maybe_custom_mask, IdType* q_indptr,
                                    IdType* maybe_mask_indptr, IdType* maybe_q_rope_offset,
                                    DTypeO* o, float* lse, float* maybe_alibi_slopes,
-                                   float* maybe_s_aux,
-                                   uint32_t num_qo_heads, IdType q_stride_n, IdType q_stride_h,
-                                   int32_t window_left, float logits_soft_cap, float sm_scale,
-                                   float rope_scale, float rope_theta)
+                                   float* maybe_s_aux, uint32_t num_qo_heads, IdType q_stride_n,
+                                   IdType q_stride_h, int32_t window_left, float logits_soft_cap,
+                                   float sm_scale, float rope_scale, float rope_theta)
       : q(q),
         paged_kv(paged_kv),
         maybe_custom_mask(maybe_custom_mask),

tests/attention/test_decode_sink_attention.py

Lines changed: 6 additions & 2 deletions

@@ -386,8 +386,12 @@ def test_single_decode_sink_attention_tensor_cores(
         k_cache_ref = k.unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
         v_cache_ref = v.unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
     else:  # HND -> transpose to NHD
-        k_cache_ref = k.transpose(0, 1).unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
-        v_cache_ref = v.transpose(0, 1).unsqueeze(0)  # [1, kv_len, num_kv_heads, head_dim]
+        k_cache_ref = k.transpose(0, 1).unsqueeze(
+            0
+        )  # [1, kv_len, num_kv_heads, head_dim]
+        v_cache_ref = v.transpose(0, 1).unsqueeze(
+            0
+        )  # [1, kv_len, num_kv_heads, head_dim]
 
     # Compute reference output
     out_ref = sink_attention_decode_ref(
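
As context for the reference path exercised above, here is a hedged sketch of a decode-time sink-attention reference: the HND cache is converted to NHD with transpose(0, 1).unsqueeze(0) (matching the shape comments in the diff), and s_aux is treated as one sink logit per head that joins the softmax normalization without an associated value row. This illustrates the idea behind sink_attention_decode_ref; it is not the test's actual helper, and the s_aux semantics are an assumption.

# Hedged sketch of a decode-time sink-attention reference. Assumes s_aux is one
# sink logit per head that participates in the softmax denominator but has no
# value row; not the actual sink_attention_decode_ref implementation.
import math
import torch

num_heads, kv_len, head_dim = 4, 16, 64

q = torch.randn(num_heads, head_dim)
k_hnd = torch.randn(num_heads, kv_len, head_dim)  # HND layout
v_hnd = torch.randn(num_heads, kv_len, head_dim)
s_aux = torch.randn(num_heads)                    # per-head sink logit (assumed)

# HND -> NHD, then add a batch dim: [1, kv_len, num_heads, head_dim]
k_cache_ref = k_hnd.transpose(0, 1).unsqueeze(0)
v_cache_ref = v_hnd.transpose(0, 1).unsqueeze(0)

sm_scale = 1.0 / math.sqrt(head_dim)
k_nhd, v_nhd = k_cache_ref[0], v_cache_ref[0]     # [kv_len, num_heads, head_dim]

scores = torch.einsum("hd,khd->hk", q, k_nhd) * sm_scale  # [num_heads, kv_len]
scores = torch.cat([scores, s_aux[:, None]], dim=-1)      # append sink logit
probs = torch.softmax(scores, dim=-1)[:, :-1]             # sink mass is dropped
out_ref = torch.einsum("hk,khd->hd", probs, v_nhd)        # [num_heads, head_dim]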
