[Feature] Add float32 to float8 conversion support in CUDA codegen

Rachmanino · Rachmanino · commit d44ee95b4932 · 2025-10-21T16:49:42.000Z
* Implemented handling for conversion from float32 to float8 (E4M3/E5M2) in the `VisitExpr_(const CastNode *)` method; the new path is taken only when both the source and target types have 2 lanes.
* Added vectorized conversion support using __nv_cvt_float2_to_fp8x2 for float2 to fp8x2 transformations.
* Enhanced type handling for better compatibility with TileLang, particularly for float8 types.
diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -920,6 +920,19 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     }
   }
 
+  // Handle conversion from float32 to float8 (E4M3/E5M2)
+  if (from_ty.is_float() && target_ty.is_float8()) {
+    // FP32 -> FP8: Use __nv_cvt_float2_to_fp8x2 for vectorized conversion (float2 -> fp8x2)
+    if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      PrintIndent();
+      stream << "*reinterpret_cast<__nv_fp8x2_storage_t*>(&(" << sret << ")) = __nv_cvt_float2_to_fp8x2(*reinterpret_cast<float2*>(&(" << src
+             << ")), __NV_SATFINITE, "
+             << (target_ty.is_float8_e4m3() ? "__NV_E4M3" : "__NV_E5M2") << ");\n";
+      os << sret;
+      return;
+    } 
+  }
+
   // Handle bfloat16 special cases with supported ops
   bool used_bf16_op = false;
   if (from_ty.is_bfloat16() || target_ty.is_bfloat16()) {
@@ -970,6 +983,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
       }
       stream << " const &>(" << src << "));\n";
       stream << "#else\n";
+      // bf16 cases don't need early return, as we use elementwise cast as fallback
     }
   }
 

Original file line number	Diff line number	Diff line change
`@@ -920,6 +920,19 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {`
`920`	`920`	`}`
`921`	`921`	`}`
`922`	`922`
	`923`	`+ // Handle conversion from float32 to float8 (E4M3/E5M2)`
	`924`	`+ if (from_ty.is_float() && target_ty.is_float8()) {`
	`925`	`+ // FP32 -> FP8: Use __nv_cvt_float2_to_fp8x2 for vectorized conversion (float2 -> fp8x2)`
	`926`	`+ if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {`
	`927`	`+ PrintIndent();`
	`928`	`+ stream << "*reinterpret_cast<__nv_fp8x2_storage_t*>(&(" << sret << ")) = __nv_cvt_float2_to_fp8x2(*reinterpret_cast<float2*>(&(" << src`
	`929`	`+ << ")), __NV_SATFINITE, "`
	`930`	`+ << (target_ty.is_float8_e4m3() ? "__NV_E4M3" : "__NV_E5M2") << ");\n";`
	`931`	`+ os << sret;`
	`932`	`+ return;`
	`933`	`+ }`
	`934`	`+ }`
	`935`	`+`
`923`	`936`	`// Handle bfloat16 special cases with supported ops`
`924`	`937`	`bool used_bf16_op = false;`
`925`	`938`	`if (from_ty.is_bfloat16() \|\| target_ty.is_bfloat16()) {`
`@@ -970,6 +983,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {`
`970`	`983`	`}`
`971`	`984`	`stream << " const &>(" << src << "));\n";`
`972`	`985`	`stream << "#else\n";`
	`986`	`+ // bf16 cases don't need early return, as we use elementwise cast as fallback`
`973`	`987`	`}`
`974`	`988`	`}`
`975`	`989`