         (4224, 4224),
     ],
 )
+# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
 @pytest.mark.parametrize(
-    "cp_world_size", [4, 2, 1],  # 1 means disabling cp
+    "cp_world_size,cp_rank,cp_tot_seqlen_k_offset",
+    [
+        (8, 0, 1),
+        (8, 7, 0),
+        (4, 3, 2),
+        (2, 0, 0),
+        (1, 0, 0),  # 1 means disabling cp
+    ],
 )
-#@pytest.mark.parametrize('seqlen_q,seqlen_k', [(1, 1)])
 def test_flash_attn_output(
     seqlen_q, seqlen_k, d, causal, local, softcap, V_colmajor, deterministic, has_qv_, mha_type, dtype, test_sink,
-    cp_world_size,
+    cp_world_size, cp_rank, cp_tot_seqlen_k_offset
 ):
     if V_colmajor and (seqlen_k % 16 != 0 or dtype != torch.float8_e4m3fn):
         pytest.skip("V_colmajor requires seqlen_k to be a multiple of 16 and dtype to be float8_e4m3fn")
@@ -157,7 +164,8 @@ def test_flash_attn_output(
     s_aux = torch.randn(nheads, device=device, dtype=torch.bfloat16) * 4 if test_sink else None
     # s_aux = torch.ones(nheads, device=device, dtype=torch.bfloat16) * 4 if test_sink else None
     # print("s_aux ", s_aux)
-    cp_rank = 0
+    cp_tot_seqlen_k = seqlen_k * cp_world_size + cp_tot_seqlen_k_offset
+    cp_tot_seqlen_k = torch.full((batch_size,), cp_tot_seqlen_k, device=device, dtype=torch.int32)
     if test_sink:
         dv_vals = [d]
     for dv in dv_vals:
@@ -175,7 +183,7 @@ def test_flash_attn_output(
         else:
             qv_ref = None
         # Put window_size after QKV randn so that window_size changes from test to test
-        window_size = (-1, -1) if not local else torch.randint(0, seqlen_k * cp_world_size, (2,))
+        window_size = (-1, -1) if not local else torch.randint(0, cp_tot_seqlen_k[0], (2,))
         # window_size = (-1, -1) if not local else (16, 0)
         if dtype == torch.float8_e4m3fn:
             q_descale, k_descale, v_descale = [torch.rand(batch_size, nheads_kv, device=device, dtype=torch.float32) * 2 for _ in range(3)]
@@ -199,6 +207,7 @@ def test_flash_attn_output(
             s_aux=s_aux,
             cp_world_size=cp_world_size,
             cp_rank=cp_rank,
+            cp_tot_seqlen_k=cp_tot_seqlen_k,
         )
         out_pt, attn_pt = attention_ref(
             q_ref,
@@ -217,6 +226,7 @@ def test_flash_attn_output(
             s_aux=s_aux,
             cp_world_size=cp_world_size,
             cp_rank=cp_rank,
+            cp_tot_seqlen_k=cp_tot_seqlen_k,
         )

         # qk = torch.einsum('bshd,bthd->bhst', q_ref, k_ref).float()
@@ -251,6 +261,7 @@ def test_flash_attn_output(
                 s_aux=s_aux,
                 cp_world_size=cp_world_size,
                 cp_rank=cp_rank,
+                cp_tot_seqused_k=cp_tot_seqlen_k,
             )
             print("Pack GQA =", pack_gqa)
             print("Num splits =", num_splits)
@@ -378,7 +389,7 @@ def test_flash_attn_output(
     ],
 )
 def test_flash_attn_varlen_output(
-    seqlen_q, seqlen_k, d, add_unused_qkv, causal, local, softcap, deterministic, has_qv_, mha_type, dtype, test_sink
+    seqlen_q, seqlen_k, d, add_unused_qkv, causal, local, softcap, deterministic, has_qv_, mha_type, dtype, test_sink,
 ):
     if has_qv_ and (d != 64 or dtype == torch.float8_e4m3fn):
         pytest.skip("Has Qv requires hdim 64 and dtype to be float16 or bfloat16 (not float8_e4m3fn)")
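For context on how the new test parameters relate: the test derives the global (context-parallel) K length as `cp_tot_seqlen_k = seqlen_k * cp_world_size + cp_tot_seqlen_k_offset`, where `seqlen_k` is the per-rank K length. The sketch below only illustrates that arithmetic under the assumption of a contiguous per-rank K shard; the helper `cp_local_k_range` is hypothetical and is not part of this PR or the flash-attn API.

```python
import torch

def cp_local_k_range(cp_rank: int, local_seqlen_k: int):
    # Hypothetical helper: global [start, end) span of one rank's K shard,
    # assuming rank r holds the contiguous keys [r * local_seqlen_k, (r + 1) * local_seqlen_k).
    start = cp_rank * local_seqlen_k
    return start, start + local_seqlen_k

batch_size, seqlen_k = 2, 128  # per-rank K length, as in the test
for cp_world_size, cp_rank, cp_tot_seqlen_k_offset in [(8, 0, 1), (8, 7, 0), (4, 3, 2), (2, 0, 0), (1, 0, 0)]:
    # Same derivation as the test body: global K length = per-rank length * world size + offset,
    # where a non-zero offset exercises a total that is not an exact multiple of the shard size.
    cp_tot_seqlen_k = seqlen_k * cp_world_size + cp_tot_seqlen_k_offset
    cp_tot_seqlen_k_tensor = torch.full((batch_size,), cp_tot_seqlen_k, dtype=torch.int32)
    start, end = cp_local_k_range(cp_rank, seqlen_k)
    assert end <= int(cp_tot_seqlen_k_tensor[0])  # each rank's shard fits inside the global K length
```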