Skip to content

Commit 2cd9f3f

Browse files
committed
cleanup and add comments
Signed-off-by: Ming Yang <minos.future@gmail.com>
1 parent 1e955be commit 2cd9f3f

File tree

4 files changed

+5
-3
lines changed

4 files changed

+5
-3
lines changed

hopper/flash_api.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ inline bool get_pack_gqa(Flash_fwd_params const& params) {
435435
// Always enable PackGQA for special case of hdim = 64, qheads/kvheads = 8, local attention
436436
// TODO: investigate more cases where PackGQA improves perf due to better tile quantization
437437
bool const packgqa_override = params.arch >= 90 && (params.h / params.h_k) == 8 &&
438-
params.is_local &&
438+
params.is_local &&
439439
params.d == 64 && (params.dv == params.d);
440440
if (packgqa_override) { return true; }
441441
#ifdef FLASHATTENTION_DISABLE_PACKGQA

hopper/seqlen.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct SeqlenInfoQK {
5656
? seqlen_k_static
5757
: (seqused_k ? seqused_k[bidb] : (cu_seqlens_k ? cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb] : seqlen_k_static)))
5858
, cp_world_size(cp_world_size)
59-
, tot_seqlen_k(cp_tot_seqused_k == nullptr
59+
, tot_seqlen_k(cp_tot_seqused_k == nullptr and cp_world_size <= 1
6060
? seqlen_k
6161
: cp_tot_seqused_k[bidb])
6262
{

hopper/test_flash_attn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ def test_flash_attn_output(
389389
],
390390
)
391391
def test_flash_attn_varlen_output(
392-
seqlen_q, seqlen_k, d, add_unused_qkv, causal, local, softcap, deterministic, has_qv_, mha_type, dtype, test_sink,
392+
seqlen_q, seqlen_k, d, add_unused_qkv, causal, local, softcap, deterministic, has_qv_, mha_type, dtype, test_sink
393393
):
394394
if has_qv_ and (d != 64 or dtype == torch.float8_e4m3fn):
395395
pytest.skip("Has Qv requires hdim 64 and dtype to be float16 or bfloat16 (not float8_e4m3fn)")

hopper/test_util.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ def construct_cp_mask(
231231
seqlen_k: Length of key sequence (local to this rank)
232232
cp_world_size: Number of context parallel ranks
233233
cp_rank: Current rank ID (0 to cp_world_size-1)
234+
cp_tot_seqlen_k: Total lengths of key sequence in cp world
234235
window_size: (left_window, right_window), -1 = infinite
235236
sink_token_length: Number of "sink" tokens that can always be attended to
236237
query_padding_mask: Which query positions are valid
@@ -350,6 +351,7 @@ def attention_ref(
350351
s_aux: (nheads)
351352
cp_world_size: Number of context parallel ranks
352353
cp_rank: Current rank ID (0 to cp_world_size-1)
354+
cp_tot_seqlen_k: (batch_size) total seqlen of k/v in cp world
353355
Output:
354356
output: (batch_size, seqlen_q, nheads, head_dim_v)
355357
attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout

0 commit comments

Comments (0)