fix: Reinterpret types to cute types in GEMM

nicunxiao · nicunxiao · commit e44875414510 · 2025-10-23T15:14:14.000+08:00
diff --git a/src/tl_templates/cuda/common.h b/src/tl_templates/cuda/common.h
@@ -322,8 +322,8 @@ TL_DEVICE void increase_descriptor_offset(GmmaDescriptor &descriptor,
 }
 
 // and add the desired implicit conversion from bfloat16_t.
-struct float_e4m3_t : public cutlass::float_e4m3_t {
-  using cutlass::float_e4m3_t::float_e4m3_t;
+struct float_e4m3_t : public cute::float_e4m3_t {
+  using cute::float_e4m3_t::float_e4m3_t;
   CUTLASS_HOST_DEVICE
   float_e4m3_t() = default;
 
@@ -332,8 +332,8 @@ struct float_e4m3_t : public cutlass::float_e4m3_t {
       : float_e4m3_t(static_cast<float>(x)) {}
 };
 
-struct float_e5m2_t : public cutlass::float_e5m2_t {
-  using cutlass::float_e5m2_t::float_e5m2_t;
+struct float_e5m2_t : public cute::float_e5m2_t {
+  using cute::float_e5m2_t::float_e5m2_t;
   CUTLASS_HOST_DEVICE
   float_e5m2_t() = default;
 
diff --git a/src/tl_templates/cuda/gemm_mma.h b/src/tl_templates/cuda/gemm_mma.h
@@ -257,18 +257,28 @@ struct OperandTraits<64, N, K, false, num_warp_n, leading_dim,
   using Copy = DefaultCopy;
 };
 
+// Maps an element type to its cute:: counterpart: identity by default, while
+// tl::float_e4m3_t / tl::float_e5m2_t map to cute::float_e4m3_t /
+// cute::float_e5m2_t.
+template<typename T> struct to_cute_type {using type = T;};
+template<> struct to_cute_type<tl::float_e4m3_t> {using type = cute::float_e4m3_t;};
+template<> struct to_cute_type<tl::float_e5m2_t> {using type = cute::float_e5m2_t;};
+
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, int lda, int ldb, int offset_a,
           int offset_b, typename A_type_raw, typename B_type_raw,
           typename C_type_raw>
 class GemmTensorOp {
 public:
+  // Operand types mapped to cute:: types; plain float becomes tfloat32_t.
+  using A_type_cute = typename to_cute_type<A_type_raw>::type;
+  using B_type_cute = typename to_cute_type<B_type_raw>::type;
   using A_type =
-      typename std::conditional<std::is_same<A_type_raw, float>::value,
-                                tfloat32_t, A_type_raw>::type;
+      typename std::conditional<std::is_same<A_type_cute, float>::value,
+                                tfloat32_t, A_type_cute>::type;
   using B_type =
-      typename std::conditional<std::is_same<B_type_raw, float>::value,
-                                tfloat32_t, A_type_raw>::type;
+      typename std::conditional<std::is_same<B_type_cute, float>::value,
+                                tfloat32_t, B_type_cute>::type;
   using C_type = C_type_raw;
 
   using Instruction =
diff --git a/src/tl_templates/cuda/gemm_sm90.h b/src/tl_templates/cuda/gemm_sm90.h
@@ -15,16 +15,25 @@ using namespace SM90;
 namespace tl_wgmma {
 
 using namespace cutlass::gemm::collective::detail; // ss_smem_selector
+// Maps an element type to its cute:: counterpart: identity by default, while
+// tl::float_e4m3_t / tl::float_e5m2_t map to cute::float_e4m3_t /
+// cute::float_e5m2_t.
+template<typename T> struct to_cute_type {using type = T;};
+template<> struct to_cute_type<tl::float_e4m3_t> {using type = cute::float_e4m3_t;};
+template<> struct to_cute_type<tl::float_e5m2_t> {using type = cute::float_e5m2_t;};
 
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, typename A_type_raw,
           typename B_type_raw, typename C_type_raw>
 class GemmTensorOp {
 public:
-  using A_type = conditional_t<std::is_same<A_type_raw, float>::value,
-                               tfloat32_t, A_type_raw>;
-  using B_type = conditional_t<std::is_same<B_type_raw, float>::value,
-                               tfloat32_t, B_type_raw>;
+  // Operand types mapped to cute:: types; plain float becomes tfloat32_t.
+  using A_type_cute = typename to_cute_type<A_type_raw>::type;
+  using B_type_cute = typename to_cute_type<B_type_raw>::type;
+  using A_type = conditional_t<std::is_same<A_type_cute, float>::value,
+                               tfloat32_t, A_type_cute>;
+  using B_type = conditional_t<std::is_same<B_type_cute, float>::value,
+                               tfloat32_t, B_type_cute>;
   using C_type = C_type_raw;
 
   static constexpr GMMA::Major GmmaMajorA =