
Commit 16b919b

[Dev] Adjust computation logic to avoid precision loss when casting acc_s from float to float16 (#141)
- Remove redundant `acc_s_0` fragment in flash attention kernel
- Simplify memory copy and reduction operations
- Reorder memory copy and scaling steps for improved performance
- Add Hopper-specific synchronization method in CUDA reduce template
- Update reduce operation to use architecture-specific synchronization
1 parent d5d4247 commit 16b919b
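
Why the reorder avoids precision loss, in a minimal NumPy sketch (illustrative values, not part of the commit): the old kernel round-tripped the raw fp32 scores through the fp16 `S_shared` staging buffer before the softmax, where large logits lose mantissa bits that `exp2` then amplifies; the new kernel exponentiates in fp32 first and only casts the resulting probabilities, which all lie in [0, 1].

```python
import numpy as np

LOG2E = np.float32(1.4426950)  # log2(e), folded into `scale` in the kernel
s = np.float32(100.1234)       # hypothetical raw attention score (fp32 accumulator)
m = np.float32(102.0)          # running row maximum

# Old order: cast the raw score to fp16 before exponentiating.
# fp16 spacing near 100 is 0.0625, so 100.1234 rounds to 100.125.
p_old = np.exp2((np.float32(np.float16(s)) - m) * LOG2E)

# New order: exponentiate in fp32, cast the probability afterwards.
p_new = np.float16(np.exp2((s - m) * LOG2E))

print(p_old, np.float32(p_new))  # rounding the raw score shifts the weight by ~0.2%
```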

File tree: 3 files changed (+36 −20)

- examples/deepseek_mla/example_mla_decode.py
- src/op/reduce.cc
- src/tl_templates/cuda/reduce.h

examples/deepseek_mla/example_mla_decode.py (+11 −16)
@@ -31,7 +31,6 @@ def flash_attn(
             K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype)
             O_shared = T.alloc_shared([block_H, dim], dtype)
             acc_s = T.alloc_fragment([block_H, block_N], accum_dtype)
-            acc_s_0 = T.alloc_fragment([block_H, block_N], accum_dtype)
             acc_s_cast = T.alloc_fragment([block_H, block_N], dtype)
             acc_o = T.alloc_fragment([block_H, dim], accum_dtype)
             scores_max = T.alloc_fragment([block_H], accum_dtype)
@@ -57,28 +56,27 @@ def flash_attn(
             for k in T.Pipelined(loop_range, num_stages=2):
                 T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared)
                 T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared)
-                T.clear(acc_s_0)
+                T.clear(acc_s)
                 T.gemm(
-                    Q_shared, KV_shared, acc_s_0, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
+                    Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
                 T.gemm(
                     Q_pe_shared,
                     K_pe_shared,
-                    acc_s_0,
+                    acc_s,
                     transpose_B=True,
                     policy=T.GemmWarpPolicy.FullCol)
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
-                T.copy(acc_s_0, S_shared)
-                T.copy(S_shared, acc_s)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
                     acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale)
                 T.reduce_sum(acc_s, scores_sum, dim=1)
+                T.copy(acc_s, S_shared)
+                T.copy(S_shared, acc_s_cast)
                 for i in T.Parallel(block_H):
                     logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i]
-                T.copy(acc_s, acc_s_cast)
                 for i, j in T.Parallel(block_H, dim):
                     acc_o[i, j] *= scores_scale[i]
                 T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol)
@@ -105,7 +103,6 @@ def flash_attn_split(
             K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype)
             O_shared = T.alloc_shared([block_H, dim], dtype)
             acc_s = T.alloc_fragment([block_H, block_N], accum_dtype)
-            acc_s_0 = T.alloc_fragment([block_H, block_N], accum_dtype)
             acc_s_cast = T.alloc_fragment([block_H, block_N], dtype)
             acc_o = T.alloc_fragment([block_H, dim], accum_dtype)
             scores_max = T.alloc_fragment([block_H], accum_dtype)
@@ -131,31 +128,29 @@ def flash_attn_split(
             for k in T.Pipelined(loop_range, num_stages=2):
                 kv_start = (seqlen_kv // num_split) * bz + k * block_N
                 kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N
-
                 T.copy(KV[bx, kv_start:kv_end, cur_kv_head, :], KV_shared)
                 T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared)
-                T.clear(acc_s_0)
+                T.clear(acc_s)
                 T.gemm(
-                    Q_shared, KV_shared, acc_s_0, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
+                    Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
                 T.gemm(
                     Q_pe_shared,
                     K_pe_shared,
-                    acc_s_0,
+                    acc_s,
                     transpose_B=True,
                     policy=T.GemmWarpPolicy.FullCol)
                 T.copy(scores_max, scores_max_prev)
                 T.fill(scores_max, -T.infinity(accum_dtype))
-                T.copy(acc_s_0, S_shared)
-                T.copy(S_shared, acc_s)
                 T.reduce_max(acc_s, scores_max, dim=1, clear=False)
                 for i in T.Parallel(block_H):
                     scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
                 for i, j in T.Parallel(block_H, block_N):
                     acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale)
                 T.reduce_sum(acc_s, scores_sum, dim=1)
+                T.copy(acc_s, S_shared)
+                T.copy(S_shared, acc_s_cast)
                 for i in T.Parallel(block_H):
                     logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i]
-                T.copy(acc_s, acc_s_cast)
                 for i, j in T.Parallel(block_H, dim):
                     acc_o[i, j] *= scores_scale[i]
                 T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol)
@@ -301,4 +296,4 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial):
     print("All close")
     latency = mod.do_bench(mod.func, n_warmup=10, n_repeat=10, profiler="torch")
     print("Tile-lang: {:.2f} ms".format(latency))
-    print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9))
+    print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9))

src/op/reduce.cc (+7 −2)
@@ -161,8 +161,13 @@ Stmt ReduceOp::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
       continue;
     int reducing_threads = (*extent) * (*scale);
     std::stringstream ss;
-    ss << "tl::AllReduce<" << this->MakeCodegenReducer() << ", "
-       << reducing_threads << ", " << (*scale) << ">::run";
+    if (Downcast<String>(T.target->attrs["arch"]) == "sm_90") {
+      ss << "tl::AllReduce<" << this->MakeCodegenReducer() << ", "
+         << reducing_threads << ", " << (*scale) << ">::run_hopper";
+    } else {
+      ss << "tl::AllReduce<" << this->MakeCodegenReducer() << ", "
+         << reducing_threads << ", " << (*scale) << ">::run";
+    }
     Array<PrimExpr> thread_reduce_args = {
         StringImm(ss.str()), BufferLoad(dst_buffer, dst_indices)};
     if (reducing_threads >= 32) {
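
The change is purely a codegen dispatch: on `sm_90` targets the lowered reduce call names the `run_hopper` entry point, which synchronizes with named `bar.sync` barriers over exactly `threads` threads rather than a block-wide `__syncthreads()` (presumably so that warp groups not participating in the reduction are not stalled on Hopper). A hypothetical Python mirror of the string choice, with invented names:

```python
def allreduce_symbol(arch: str, reducer: str, threads: int, scale: int) -> str:
    # Mirrors ReduceOp::Lower above: only Hopper (sm_90) gets run_hopper.
    entry = "run_hopper" if arch == "sm_90" else "run"
    return f"tl::AllReduce<{reducer}, {threads}, {scale}>::{entry}"

assert allreduce_symbol("sm_90", "tl::SumOp", 128, 1).endswith("::run_hopper")
assert allreduce_symbol("sm_80", "tl::SumOp", 128, 1).endswith("::run")
```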

src/tl_templates/cuda/reduce.h (+18 −2)
@@ -33,10 +33,8 @@ template <class Reducer, int threads, int scale> struct AllReduce {
     constexpr int offset = threads / 2;
     if constexpr (offset >= 32) {
       __syncthreads();
-      // asm volatile("bar.sync %0, %1;" : : "r"(1), "r"(256));
       red_buf[threadIdx.x] = x;
       __syncthreads();
-      // asm volatile("bar.sync %0, %1;" : : "r"(2), "r"(256));
       x = Reducer()(x, red_buf[threadIdx.x ^ offset]);
     } else {
       x = Reducer()(x, T(__shfl_xor_sync(uint32_t(-1), x, offset)));
@@ -47,6 +45,24 @@ template <class Reducer, int threads, int scale> struct AllReduce {
       return AllReduce<Reducer, offset, scale>::run(x, red_buf);
     }
   }
+
+  template <typename T>
+  static TL_DEVICE T run_hopper(T x, T *red_buf = nullptr) {
+    constexpr int offset = threads / 2;
+    if constexpr (offset >= 32) {
+      asm volatile("bar.sync %0, %1;" : : "r"(1), "r"(threads));
+      red_buf[threadIdx.x] = x;
+      asm volatile("bar.sync %0, %1;" : : "r"(2), "r"(threads));
+      x = Reducer()(x, red_buf[threadIdx.x ^ offset]);
+    } else {
+      x = Reducer()(x, T(__shfl_xor_sync(uint32_t(-1), x, offset)));
+    }
+    if constexpr (offset == scale) {
+      return x;
+    } else {
+      return AllReduce<Reducer, offset, scale>::run_hopper(x, red_buf);
+    }
+  }
 };
 
 } // namespace tl
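
Both `run` and `run_hopper` unroll the same XOR-butterfly exchange: each thread combines its value with the partner at `threadIdx.x ^ offset`, and `offset` halves until it reaches `scale`. A small Python simulation of that pattern, with one list element standing in for one thread's register (illustrative only, not part of the commit):

```python
def butterfly_allreduce(vals, reducer, scale=1):
    """Simulate tl::AllReduce<Reducer, len(vals), scale> (one value per thread)."""
    offset = len(vals) // 2
    while offset >= scale:
        # every "thread" i reads its partner i ^ offset, as via red_buf/__shfl_xor_sync
        vals = [reducer(vals[i], vals[i ^ offset]) for i in range(len(vals))]
        offset //= 2
    return vals

print(butterfly_allreduce(list(range(8)), lambda a, b: a + b))  # [28] * 8
```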
