
Commit 7f5d59a

[BugFix] Use release order to boost performance
1 parent 2079660 commit 7f5d59a

1 file changed: +4 -4 lines changed


examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py

Lines changed: 4 additions & 4 deletions
@@ -8,7 +8,7 @@
 from bert_padding import pad_input, unpad_input

 # tilelang.disable_cache()
-# torch.manual_seed(0)
+torch.manual_seed(1)


 def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"):
@@ -371,18 +371,18 @@ def flash_bwd(
                     T.atomic_add(
                         dQ[q_start_idx + k_base * block_N + i, bx, d],
                         dq[i, d],
-                        memory_order="acq_rel")
+                        memory_order="release")

                 for i, d in T.Parallel(block_M, dim_v):
                     T.atomic_add(
                         dV[k_start_idx + by * block_M + i, bx // groups, d],
                         dv[i, d],
-                        memory_order="acq_rel")
+                        memory_order="release")
                 for i, d in T.Parallel(block_M, dim_qk):
                     T.atomic_add(
                         dK[k_start_idx + by * block_M + i, bx // groups, d],
                         dk[i, d],
-                        memory_order="acq_rel")
+                        memory_order="release")

     return flash_bwd

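Note on the change: the switch from "acq_rel" to "release" presumably helps because these three T.atomic_add calls only publish per-tile partial sums of dQ, dV, and dK into global memory, and the kernel never reads those accumulators back; the acquire half of "acq_rel" then adds synchronization cost without buying anything. Below is a minimal, hypothetical TileLang-style sketch (not part of this commit) of the same publish-only accumulation pattern. The kernel name, shapes, and thread count are illustrative assumptions; T.Parallel and T.atomic_add with memory_order="release" mirror the calls in the diff above.

# Hypothetical sketch, not part of the commit: a small accumulation kernel in
# the style of this example file, using the publish-only "release" ordering.
import tilelang
import tilelang.language as T


def atomic_accumulate(M, N, block_M, block_N, dtype="float32"):

    @T.prim_func
    def main(src: T.Tensor((M, N), dtype), dst: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):
            for i, j in T.Parallel(block_M, block_N):
                # The atomic only adds a partial result into dst and its old
                # value is never read afterwards, so a release ordering (make
                # prior writes visible, no acquire fence) is sufficient.
                T.atomic_add(
                    dst[bx * block_M + i, by * block_N + j],
                    src[bx * block_M + i, by * block_N + j],
                    memory_order="release")

    return main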
