Overload fp8 for implicit conversion

nicunxiao · nicunxiao · commit 5c251475baf7 · 2025-10-23T13:33:55.000+08:00
diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -953,23 +953,18 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     }
   }
 
-  const char *convert_part =
-      (from_ty.is_bfloat16() &&
-       (target_ty.is_float8_e4m3() || target_ty.is_float8_e5m2()))
-          ? ")(half)("
-          : ")(";
-
   // Fallback: elementwise cast
   for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) {
     std::ostringstream val;
     val << "(";
     PrintType(target_ty.element_of(), val);
-    val << convert_part;
+    val << ")(";
     PrintVecElemLoad(src, from_ty, i, val);
     val << ")";
     PrintVecElemStore(sret, target_ty, i, val.str());
   }
 
+
   if (used_bf16_op) {
     stream << "#endif\n";
   }
diff --git a/src/tl_templates/cuda/common.h b/src/tl_templates/cuda/common.h
@@ -10,6 +10,9 @@
 #include <cutlass/numeric_types.h>
 #include <math_constants.h>
 
+#include <cutlass/float8.h>
+#include <cutlass/bfloat16.h>
+
 using cutlass::bfloat16_t;
 using cutlass::half_t;
 using cutlass::tfloat32_t;
@@ -318,6 +321,27 @@ TL_DEVICE void increase_descriptor_offset(GmmaDescriptor &descriptor,
   descriptor.reg32_[0] += (offset >> 4);
 }
 
+// CUTLASS FP8 subclasses with an explicit __nv_bfloat16 constructor (via float).
+struct float_e4m3_t : public cutlass::float_e4m3_t {
+  using cutlass::float_e4m3_t::float_e4m3_t;
+  CUTLASS_HOST_DEVICE
+  float_e4m3_t() = default;
+
+  CUTLASS_HOST_DEVICE
+  explicit float_e4m3_t(__nv_bfloat16 x) : float_e4m3_t(static_cast<float>(x)) {
+  }
+};
+
+struct float_e5m2_t : public cutlass::float_e5m2_t {
+  using cutlass::float_e5m2_t::float_e5m2_t;
+  CUTLASS_HOST_DEVICE
+  float_e5m2_t() = default;
+
+  CUTLASS_HOST_DEVICE
+  explicit float_e5m2_t(__nv_bfloat16 x) : float_e5m2_t(static_cast<float>(x)) {
+  }
+};
+
 } // namespace tl
 
 namespace cutlass {
diff --git a/src/tl_templates/cuda/cuda_fp8.h b/src/tl_templates/cuda/cuda_fp8.h
@@ -2,9 +2,10 @@
 
 #include <cuda_fp8.h>
 #include <cute/numeric/numeric_types.hpp>
+#include "common.h"
 
-using fp8_e4_t = cute::float_e4m3_t;
-using fp8_e5_t = cute::float_e5m2_t;
+using fp8_e4_t = tl::float_e4m3_t;
+using fp8_e5_t = tl::float_e5m2_t;
 
 struct __CUDA_ALIGN__(2) fp8_e4_2_t {
   fp8_e4_t x;

Original file line number	Diff line number	Diff line change
`@@ -953,23 +953,18 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {`
`953`	`953`	`}`
`954`	`954`	`}`
`955`	`955`
`956`		`- const char *convert_part =`
`957`		`- (from_ty.is_bfloat16() &&`
`958`		`- (target_ty.is_float8_e4m3() \|\| target_ty.is_float8_e5m2()))`
`959`		`- ? ")(half)("`
`960`		`- : ")(";`
`961`		`-`
`962`	`956`	`// Fallback: elementwise cast`
`963`	`957`	`for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) {`
`964`	`958`	`std::ostringstream val;`
`965`	`959`	`val << "(";`
`966`	`960`	`PrintType(target_ty.element_of(), val);`
`967`		`- val << convert_part;`
	`961`	`+ val << ")(";`
`968`	`962`	`PrintVecElemLoad(src, from_ty, i, val);`
`969`	`963`	`val << ")";`
`970`	`964`	`PrintVecElemStore(sret, target_ty, i, val.str());`
`971`	`965`	`}`
`972`	`966`
	`967`	`+`
`973`	`968`	`if (used_bf16_op) {`
`974`	`969`	`stream << "#endif\n";`
`975`	`970`	`}`