
Commit 1774a1a

Authored by tzj-fxz, chengyupku, and johnnynunez
[Feature] Add 1D TMA support (#761)
* [Feature] Add 1D TMA support
  - Check the contiguity conditions for a 1D TMA copy
  - Add a new interface and parameter order for the `tma_load` and `tma_store` calls
  - Add a 1D `tma_store` interface to the sm90 template
  - Add an elementwise kernel as a 1D TMA example
* [Lint]
* [BugFix] Add conditions for the 1D TMA copy on non-swizzled shared tensors
* [Lint]
* [BugFix] 1D TMA load
* [README] Update the GDN README for clarity and add acknowledgements (#758)
  - Improved the formatting and clarity of the GDN kernel implementation description.
  - Updated the requirements section to list dependencies in a clearer format.
  - Added an acknowledgements section to credit the developers and the Xiaomi LLM-Core Team for their contributions.
* cutlass v4.2.0, supporting CUDA 13 (#760)
* [Lint]
* [Lint]
* [MXFP4] Add a test for the bf16 & mxfp4 GEMM
* [BugFix]
* [Lint]

Co-authored-by: Yu Cheng <54519279+chengyupku@users.noreply.github.com>
Co-authored-by: Johnny <johnnync13@gmail.com>
1 parent e05a20a commit 1774a1a


10 files changed: +291 −17 lines


examples/dequantize_gemm/test_example_dequantize_gemm.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -2,6 +2,7 @@

 import example_dequant_gemv_fp16xint4
 import example_dequant_gemm_fp4_hopper
+import example_dequant_gemm_bf16_mxfp4_hopper


 @tilelang.testing.requires_cuda
@@ -15,5 +16,11 @@ def test_example_dequant_gemm_fp4_hopper():
     example_dequant_gemm_fp4_hopper.main()


+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
+def test_example_dequant_gemm_bf16_mxfp4_hopper():
+    example_dequant_gemm_bf16_mxfp4_hopper.main()
+
+
 if __name__ == "__main__":
     tilelang.testing.main()
```
example_elementwise_add_tma_1d.py (new file)

Lines changed: 53 additions & 0 deletions

```python
import argparse
import tilelang
import tilelang.language as T
import torch


def ref_program(x, y):
    return x + y


@tilelang.jit(out_idx=[-1])
def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads):

    @T.prim_func
    def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor(
            (M, N), out_dtype)):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_N), in_dtype)
            B_shared = T.alloc_shared((block_M, block_N), in_dtype)
            C_local = T.alloc_fragment((block_M, block_N), out_dtype)
            C_shared = T.alloc_shared((block_M, block_N), out_dtype)

            # Whole-tile global <-> shared copies; these are the transfers the
            # new 1D TMA lowering can pick up when the regions are contiguous
            T.copy(A[by * block_M, bx * block_N], A_shared)
            T.copy(B[by * block_M, bx * block_N], B_shared)
            for (local_y, local_x) in T.Parallel(block_M, block_N):
                C_local[local_y, local_x] = A_shared[local_y, local_x] + B_shared[local_y, local_x]
            T.copy(C_local, C_shared)
            T.copy(C_shared, C[by * block_M, bx * block_N])

    return elem_add


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--m", type=int, default=128)
    parser.add_argument("--n", type=int, default=128)
    args, _ = parser.parse_known_args()
    M, N = args.m, args.n

    a = torch.randn(M, N, dtype=torch.float32, device="cuda")
    b = torch.randn(M, N, dtype=torch.float32, device="cuda")

    # Default config
    config = {"block_M": 128, "block_N": 128, "threads": 128}
    kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32")

    out = kernel(a, b)
    torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2)
    print("All passed!")


if __name__ == "__main__":
    main()
```
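A note on the default configuration: with block_M = block_N = 128 over the default 128x128 problem size, each `T.copy` above moves one fully contiguous region, which appears to be exactly what the 1D lowering path in `src/op/copy.cc` (below) requires — scanned from the innermost dimension outward, the copied region must cover whole dimensions, with at most one partial dimension before the remaining extents collapse to 1. The example can be run directly, e.g. `python example_elementwise_add_tma_1d.py --m 256 --n 256` (argument names taken from the argparse setup above).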
Lines changed: 5 additions & 0 deletions
```diff
@@ -1,10 +1,15 @@
 import tilelang.testing
 import example_elementwise_add
+import example_elementwise_add_tma_1d


 def test_example_elementwise_add():
     example_elementwise_add.main()


+def test_example_elementwise_add_tma_1d():
+    example_elementwise_add_tma_1d.main()
+
+
 if __name__ == "__main__":
     tilelang.testing.main()
```

examples/gdn/example_wy_fast_bwd_split.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,6 @@

 import torch
 import torch.nn.functional as F
-from utils import assert_similar

 torch.random.manual_seed(0)
 torch.set_printoptions(profile="full")
@@ -504,6 +503,7 @@ def run_test(
     dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(
         dim=-1)

+    from utils import assert_similar
     assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False)
     assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False)
     assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False)
```

src/op/copy.cc

Lines changed: 117 additions & 2 deletions
```diff
@@ -772,19 +772,133 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer,
     stride *= s;
   }

+  Array<PrimExpr> global_indices;
+  for (auto r : global_range) {
+    global_indices.push_back(r->min);
+  }
+  std::vector<PrimExpr> global_strides;
+  PrimExpr global_stride = 1;
+  for (size_t i = 0; i < global_tensor->shape.size(); i++) {
+    auto s = global_tensor->shape[global_tensor->shape.size() - i - 1];
+    global_strides.insert(global_strides.begin(), global_stride);
+    global_stride *= s;
+  }
+
   ICHECK(strides.size() == indices.size())
       << "strides.size() != indices.size()" << strides.size() << " "
       << indices.size();
   PrimExpr offset = 0;
   for (size_t i = 0; i < indices.size(); i++) {
     offset += indices[i] * strides[i];
   }
+  PrimExpr global_offset = 0;
+  for (size_t i = 0; i < global_indices.size(); i++) {
+    global_offset += global_indices[i] * global_strides[i];
+  }
+  auto shared_tensor_before_remap = shared_tensor;
   Layout shared_layout;
   if (T.layout_map.count(shared_tensor)) {
     shared_layout = T.layout_map[shared_tensor];
     shared_tensor = T.buffer_remap[shared_tensor];
   }

+  // Use a 1D TMA copy when both the global and the shared regions are
+  // contiguous
+  {
+    // Check whether shared_tensor->name appears in T.buffer_var_gemm
+    // (Array<Var>) to avoid using the 1D TMA copy on swizzled shared layouts
+    bool shared_is_contiguous = true;
+    for (const auto &v : T.buffer_var_gemm) {
+      if (v->name_hint == shared_tensor->name) {
+        shared_is_contiguous = false;
+        break;
+      }
+    }
+    bool shared_not_full_dim_encounter = false;
+    for (ssize_t i = shared_range.size() - 1; i >= 0; --i) {
+      if (!shared_not_full_dim_encounter) {
+        if (!analyzer->CanProve(shared_range[i]->extent ==
+                                    shared_tensor_before_remap->shape[i] &&
+                                shared_range[i]->min == 0)) {
+          shared_not_full_dim_encounter = true;
+        }
+      } else {
+        if (!analyzer->CanProve(shared_range[i]->extent == 1)) {
+          shared_is_contiguous = false;
+          break;
+        }
+      }
+    }
+    // Currently we check the empty stride of the global tensor
+    bool global_is_contiguous = !global_tensor->strides.empty();
+    bool global_not_full_dim_encounter = false;
+    for (ssize_t i = global_range.size() - 1; i >= 0; --i) {
+      if (!global_not_full_dim_encounter) {
+        if (!analyzer->CanProve(global_range[i]->extent ==
+                                    global_tensor->shape[i] &&
+                                global_range[i]->min == 0)) {
+          global_not_full_dim_encounter = true;
+        }
+      } else {
+        if (!analyzer->CanProve(global_range[i]->extent == 1)) {
+          global_is_contiguous = false;
+          break;
+        }
+      }
+    }
+    // Ensure the element counts match and there is no out-of-bounds access
+    PrimExpr shared_elements = 1;
+    for (size_t i = 0; i < shared_range.size(); i++) {
+      shared_elements *= shared_range[i]->extent;
+    }
+    PrimExpr global_elements = 1;
+    for (size_t i = 0; i < global_range.size(); i++) {
+      global_elements *= global_range[i]->extent;
+    }
+    bool element_match =
+        analyzer->CanProveEqual(shared_elements, global_elements);
+    bool no_oob = true;
+    for (size_t i = 0; i < shared_range.size(); i++) {
+      if (!analyzer->CanProve(shared_range[i]->min + shared_range[i]->extent <=
+                              shared_tensor_before_remap->shape[i])) {
+        no_oob = false;
+        break;
+      }
+    }
+    for (size_t i = 0; i < global_range.size(); i++) {
+      if (!analyzer->CanProve(global_range[i]->min + global_range[i]->extent <=
+                              global_tensor->shape[i])) {
+        no_oob = false;
+        break;
+      }
+    }
+    // Use the 1D TMA copy only for loads
+    if (shared_is_contiguous && global_is_contiguous && element_match &&
+        no_oob && is_load) {
+      PrimExpr elements = analyzer->Simplify(shared_elements);
+      PrimExpr shared_addr = shared_tensor_before_remap.access_ptr(
+          is_load ? 2 : 1, DataType::Handle(), 1, offset, elements);
+      PrimExpr global_addr = global_tensor.access_ptr(
+          is_load ? 1 : 2, DataType::Handle(), 1, global_offset, elements);
+      Stmt tma_copy;
+      if (is_load) {
+        // the zero is a placeholder for the mbarrier id
+        tma_copy =
+            Evaluate(Call(DataType::Handle(), tma_load(),
+                          {shared_addr, global_addr, 0,
+                           elements * shared_tensor_before_remap->dtype.bytes(),
+                           this->eviction_policy}));
+      } else {
+        tma_copy =
+            Evaluate(Call(DataType::Handle(), tma_store(),
+                          {global_addr, shared_addr,
+                           elements * shared_tensor_before_remap->dtype.bytes(),
+                           this->eviction_policy}));
+      }
+      tma_copy = IfThenElse(EQ(T.thread_var, T.thread_bounds->min), tma_copy);
+      return tma_copy;
+    }
+  }
+
   TMADesc desc;
   // Verify copy rank
   desc.rank = global_tensor->shape.size();
@@ -1221,10 +1335,11 @@ Array<PrimExpr> TMAIm2ColDesc::EncodeCallArgs() const {

 // Register the Copy operation with TVM's TIR system
 // This makes the copy operation available for use in TVM programs
-// - Takes 4 inputs: src_buffer, dst_buffer, coalesced_width, disable_tma
+// - Takes 5 inputs: src_buffer, dst_buffer, coalesced_width, disable_tma,
+//   eviction_policy
 // - Marked as opaque since it has side effects (memory writes)
 TIR_REGISTER_TL_OP(Copy, copy)
-    .set_num_inputs(4)
+    .set_num_inputs(5)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kOpaque));
```
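The contiguity test above walks dimensions from innermost to outermost. As a minimal sketch of that rule on plain integers (a hypothetical standalone helper for illustration, not part of the commit):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// A slice is contiguous if, scanning from the innermost dimension outward,
// every dimension covers its full extent until the first partial dimension
// is met, and every dimension outside that one has extent 1.
// Each range entry is {min, extent}; shape[i] is the full size of dim i.
bool IsContiguousSlice(const std::vector<std::pair<int, int>> &range,
                       const std::vector<int> &shape) {
  bool partial_dim_seen = false;
  for (std::ptrdiff_t i = static_cast<std::ptrdiff_t>(range.size()) - 1;
       i >= 0; --i) {
    if (!partial_dim_seen) {
      if (!(range[i].first == 0 && range[i].second == shape[i]))
        partial_dim_seen = true;
    } else if (range[i].second != 1) {
      return false;  // a partial dim may only have size-1 dims outside it
    }
  }
  return true;
}
```

For example, a block of whole rows of a 128x128 tensor passes (the inner dimension is full, the outer one is the single partial dimension), while a 128x64 tile fails: the inner dimension is partial and the outer extent of 128 is not 1.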

src/op/op.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -49,6 +49,7 @@ struct LowerArgs {
   AddWorkspaceCallback AddWorkspace;
   LayoutMap layout_map;
   Map<Buffer, Buffer> buffer_remap;
+  Array<Var> buffer_var_gemm;
 };

 struct LayoutInferArgs {
```
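For context: `buffer_var_gemm` appears to carry the buffer variables of shared tensors that feed GEMM operations; `Copy::LowerBulkCopy` (above) consults it so the 1D TMA fast path skips those operands, whose shared layouts may be swizzled.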

src/tl_templates/cuda/copy_sm90.h

Lines changed: 10 additions & 0 deletions
```diff
@@ -171,6 +171,16 @@ tma_load_im2col(const CUtensorMap &descriptor, BarrierType &smem_mbar,
       : "memory");
 }

+template <CacheHintSm90 cache_hint = CacheHintSm90::EVICT_NORMAL>
+TL_DEVICE void tma_store(void *gmem_ptr, void *smem_ptr, uint32_t size) {
+  uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr);
+  asm volatile("cp.async.bulk.global.shared::cta.bulk_group"
+               ".L2::cache_hint [%0], [%1], %2, %3;"
+               :
+               : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(size), "l"(cache_hint)
+               :);
+}
+
 template <CacheHintSm90 cache_hint = CacheHintSm90::EVICT_NORMAL>
 TL_DEVICE void tma_store(const CUtensorMap &descriptor,
                          void const *const smem_ptr, int32_t const &crd0) {
```
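This diff adds only the 1D `tma_store`; the matching 1D `tma_load` overload that the lowering in `src/op/copy.cc` targets is not shown in this extraction. A hedged sketch of what such an overload could look like, mirroring the PTX style of the surrounding header (the signature is assumed from the call site, not confirmed by this diff):

```cuda
// Sketch only: a 1D bulk TMA load whose completion is signaled on an
// mbarrier, built from the cp.async.bulk PTX forms and the header's own
// smem_ptr_to_uint / CacheHintSm90 helpers.
template <CacheHintSm90 cache_hint = CacheHintSm90::EVICT_NORMAL,
          typename BarrierType>
TL_DEVICE void tma_load(void *smem_ptr, void *gmem_ptr, BarrierType &smem_mbar,
                        uint32_t size) {
  uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr);
  uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar);
  // Bulk copy global -> shared; transaction bytes tracked by the mbarrier
  asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx"
               "::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;"
               :
               : "r"(smem_int_ptr), "l"(gmem_ptr), "r"(size),
                 "r"(smem_int_mbar), "l"(cache_hint)
               : "memory");
}
```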

src/transform/inject_tma_barrier.cc

Lines changed: 26 additions & 7 deletions
```diff
@@ -62,10 +62,17 @@ class TmaTraitsCollector : public StmtExprVisitor {
 private:
   void VisitExpr_(const CallNode *call) final {
     if (call->op.same_as(tma_load()) || call->op.same_as(tma_load_im2col())) {
-      Call access_ptr = Downcast<Call>(call->args[2]);
-      ICHECK(access_ptr->op.same_as(builtin::tvm_access_ptr()));
-      int type_bytes = access_ptr->args[0]->dtype.bytes();
-      bulk_copy_bytes += access_ptr->args[3] * loop_extents * type_bytes;
+      auto arg0 = call->args[0].as<Call>();
+      if (call->op.same_as(tma_load()) && arg0 &&
+          !arg0.value()->op.same_as(create_tma_descriptor())) {
+        // 1D TMA load has the tvm_access_ptr of the shared tensor in args[0]
+        bulk_copy_bytes = call->args[3] * loop_extents;
+      } else {
+        Call access_ptr = Downcast<Call>(call->args[2]);
+        ICHECK(access_ptr->op.same_as(builtin::tvm_access_ptr()));
+        int type_bytes = access_ptr->args[0]->dtype.bytes();
+        bulk_copy_bytes += access_ptr->args[3] * loop_extents * type_bytes;
+      }
     }
     StmtExprVisitor::VisitExpr_(call);
   }
@@ -155,10 +162,15 @@ class TmaExpectTxRewriter : public IRMutatorWithAnalyzer {

   PrimExpr VisitExpr_(const CallNode *op) {
     if (op->op.same_as(tma_load())) {
+      auto arg0 = op->args[0].as<Call>();
+      bool is_1d_tma_load =
+          arg0 && !arg0.value()->op.same_as(create_tma_descriptor()) &&
+          op->op.same_as(tma_load());
       visited_tma_load_ = true;
       Array<PrimExpr> new_args = op->args;
-      new_args.Set(1, Call(DataType::Handle(), get_mbarrier(),
-                           {IntImm(DataType::Int(32), 0)}));
+      new_args.Set(is_1d_tma_load ? 2 : 1,
+                   Call(DataType::Handle(), get_mbarrier(),
+                        {IntImm(DataType::Int(32), 0)}));
       return Call(op->dtype, op->op, new_args);
     }
     return IRMutatorWithAnalyzer::VisitExpr_(op);
@@ -443,7 +455,14 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer {
           << "tma_load must be in the tma_op_to_barrier_id_";
       auto barrier_id = tma_op_to_barrier_id_[GetRef<Call>(op)];
       auto new_args = op->args;
-      new_args.Set(1, barrier_id);
+      auto arg0 = op->args[0].as<Call>();
+      auto is_1d_tma_load =
+          arg0 && !arg0.value()->op.same_as(create_tma_descriptor());
+      if (is_1d_tma_load) {
+        new_args.Set(2, barrier_id);
+      } else {
+        new_args.Set(1, barrier_id);
+      }
       return Call(op->dtype, op->op, new_args);
     } else if (op->op.same_as(mbarrier_expect_tx())) {
       ICHECK(tma_op_to_barrier_id_.count(GetRef<Call>(op)))
```
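All three passes distinguish the 1D form the same way. A distillation for reference (the helper name `Is1DTmaLoad` is illustrative, not from the commit; the argument layout is reconstructed from this commit's diffs):

```cpp
// Detection used above: a tma_load call is the 1D form iff its first argument
// is a call that is not create_tma_descriptor(), i.e. it is already the
// tvm_access_ptr of the shared tensor rather than a multi-dim TMA descriptor.
bool Is1DTmaLoad(const CallNode *op) {
  auto arg0 = op->args[0].as<Call>();
  return arg0 && !arg0.value()->op.same_as(create_tma_descriptor());
}

// Resulting argument layouts (as emitted by Copy::LowerBulkCopy):
//   descriptor-based tma_load: args[1] = mbarrier, args[2] = shared access_ptr
//   1D tma_load:               args[0] = shared access_ptr,
//                              args[1] = global access_ptr,
//                              args[2] = mbarrier,
//                              args[3] = copy size in bytes,
//                              args[4] = eviction policy
// which is why the rewriters patch index 2 instead of index 1 for the 1D form.
```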
