
Commit 50d4d5d

[Enhancement] Support lanes=4 cases and add unit test for vectorized cast

1 parent 0dd40d4

2 files changed: +129, -0 lines

src/target/codegen_cuda.cc

Lines changed: 39 additions & 0 deletions
@@ -904,19 +904,41 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
   if (from_ty.is_float16() && target_ty.is_float()) {
     // Use __half22float2 for vectorized conversion (half2 -> float2)
     if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      // half2 -> float2
       PrintIndent();
       stream << sret << " = __half22float2(*(half2*)(&(" << src << ")));\n";
       os << sret;
       return;
+    } else if (from_ty.lanes() == 4 && target_ty.lanes() == 4) {
+      // half4 -> float4
+      PrintIndent();
+      stream << "((float2*)(&" << sret << "))[0] = "
+             << "__half22float2(*(half2*)(&(" << src << ")));\n";
+      PrintIndent();
+      stream << "((float2*)(&" << sret << "))[1] = "
+             << "__half22float2(*((half2*)(&(" << src << "))+1));\n";
+      os << sret;
+      return;
     }
   } else if (from_ty.is_float() && target_ty.is_float16()) {
     // Use __float22half2_rn for vectorized conversion (float2 -> half2)
     if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      // float2 -> half2
       PrintIndent();
       stream << "*(half2*)(&(" << sret << ")) = __float22half2_rn(*(float2*)(&("
              << src << ")));\n";
       os << sret;
       return;
+    } else if (from_ty.lanes() == 4 && target_ty.lanes() == 4) {
+      // float4 -> half4
+      PrintIndent();
+      stream << "((half2*)(&" << sret << "))[0] = "
+             << "__float22half2_rn(*(float2*)(&(" << src << ")));\n";
+      PrintIndent();
+      stream << "((half2*)(&" << sret << "))[1] = "
+             << "__float22half2_rn(*((float2*)(&(" << src << "))+1));\n";
+      os << sret;
+      return;
     }
   }
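For reference, here is a minimal standalone sketch (not part of the commit) of the CUDA that the new lanes=4 half -> float path emits. The kernel and the h4/f4 variables are hypothetical stand-ins for the generated src and sret buffers:

#include <cuda_fp16.h>

// Hypothetical kernel wrapping the two statements the codegen emits for a
// half4 -> float4 cast: each half2 pair is widened with __half22float2.
__global__ void cast_half4_to_float4(const __half *in, float4 *out) {
  // CUDA has no built-in half4 type, so the generated code treats the
  // source as two adjacent half2 values; load them here as one 64-bit uint2.
  uint2 h4 = *reinterpret_cast<const uint2 *>(in);
  float4 f4;
  ((float2 *)(&f4))[0] = __half22float2(*(half2 *)(&h4));
  ((float2 *)(&f4))[1] = __half22float2(*((half2 *)(&h4) + 1));
  *out = f4;
}

The same two-step pattern, with __float22half2_rn, covers the float4 -> half4 direction added below it.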
@@ -926,6 +948,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     // FP32 -> FP8: Use __nv_cvt_float2_to_fp8x2 for vectorized conversion
     // (float2 -> fp8x2)
     if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      // float2 -> fp8x2
       PrintIndent();
       stream << "*reinterpret_cast<__nv_fp8x2_storage_t*>(&(" << sret
              << ")) = __nv_cvt_float2_to_fp8x2(*reinterpret_cast<float2*>(&("
@@ -934,10 +957,26 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
              << ");\n";
       os << sret;
       return;
+    } else if (from_ty.lanes() == 4 && target_ty.lanes() == 4) {
+      // float4 -> fp8x4
+      PrintIndent();
+      stream << "((__nv_fp8x2_storage_t*)(&" << sret << "))[0] = "
+             << "__nv_cvt_float2_to_fp8x2(*(float2*)(&(" << src
+             << ")), __NV_SATFINITE, "
+             << (target_ty.is_float8_e4m3() ? "__NV_E4M3" : "__NV_E5M2")
+             << ");\n";
+      PrintIndent();
+      stream << "((__nv_fp8x2_storage_t*)(&" << sret << "))[1] = "
+             << "__nv_cvt_float2_to_fp8x2(*((float2*)(&(" << src
+             << "))+1), __NV_SATFINITE, "
+             << (target_ty.is_float8_e4m3() ? "__NV_E4M3" : "__NV_E5M2")
+             << ");\n";
     }
   }

   // Handle bfloat16 special cases with supported ops
+  // NOTE(wt): Currently bf16 related ops don't support lanes=4;
+  // we should add this in the future.
   bool used_bf16_op = false;
   if (from_ty.is_bfloat16() || target_ty.is_bfloat16()) {
     std::ostringstream func_name;
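Likewise, a hedged sketch of what the new lanes=4 fp32 -> fp8 path emits, wrapped in a standalone kernel; the f4/p4 names are illustrative, and __NV_E4M3 stands in for the is_float8_e4m3() ternary in the codegen:

#include <cuda_fp8.h>

// Hypothetical kernel wrapping the two statements the codegen emits for a
// float4 -> fp8x4 cast: each float2 half is packed by
// __nv_cvt_float2_to_fp8x2 with saturate-to-finite handling.
__global__ void cast_float4_to_fp8x4(const float4 *in,
                                     __nv_fp8x4_storage_t *out) {
  float4 f4 = *in;
  __nv_fp8x4_storage_t p4;  // 32-bit backing store for four packed fp8 values
  ((__nv_fp8x2_storage_t *)(&p4))[0] =
      __nv_cvt_float2_to_fp8x2(*(float2 *)(&f4), __NV_SATFINITE, __NV_E4M3);
  ((__nv_fp8x2_storage_t *)(&p4))[1] =
      __nv_cvt_float2_to_fp8x2(*((float2 *)(&f4) + 1), __NV_SATFINITE,
                               __NV_E4M3);
  *out = p4;
}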
Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
import torch
import tilelang.testing
import tilelang.language as T


str2dtype = {
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float8_e4m3": torch.float8_e4m3fn,
    "float8_e5m2": torch.float8_e5m2,
}


@tilelang.jit(compile_flags=['-DENABLE_BF16'])
def vectorized_cast_kernel(M: int, dtype_A: str, dtype_B: str):
    assert M % 256 == 0

    @T.prim_func
    def main(
            A: T.Tensor[(M), dtype_A],  # noqa: F821
            B: T.Tensor[(M), dtype_B],  # noqa: F821
    ):
        with T.Kernel(1, threads=128):
            T.copy(A, B)

    return main


def run_vectorized_cast(
    src_dtype_str: str,
    dst_dtype_str: str,
    check_str: str,
    lanes: int = 2
):
    """Run the vectorized cast kernel and check its correctness.

    Args:
        src_dtype_str: The source data type string.
        dst_dtype_str: The destination data type string.
        check_str: Substring that must appear in the generated kernel
            source; used to ensure the vectorized cast path was taken.
        lanes: The number of lanes of the source and destination data
            types; the tensor size M is derived as 128 * lanes.
    """

    M = 128 * lanes
    kernel = vectorized_cast_kernel(M, src_dtype_str, dst_dtype_str)

    A = torch.randn(M, dtype=str2dtype[src_dtype_str]).cuda()
    B = torch.zeros(M, dtype=str2dtype[dst_dtype_str]).cuda()

    kernel(A, B)

    torch.testing.assert_close(A.to(str2dtype[dst_dtype_str]), B)

    code = kernel.get_kernel_source()

    assert check_str in code, \
        f"Cast {src_dtype_str} to {dst_dtype_str} with {lanes=} is not vectorized!"


def test_vectorized_cast():
    # fp32 -> fp16
    run_vectorized_cast("float32", "float16", "__float22half2_rn", 2)
    run_vectorized_cast("float32", "float16", "__float22half2_rn", 4)

    # fp16 -> fp32
    run_vectorized_cast("float16", "float32", "__half22float2", 2)
    run_vectorized_cast("float16", "float32", "__half22float2", 4)

    # fp32 -> fp8_e4m3
    run_vectorized_cast("float32", "float8_e4m3", "__nv_cvt_float2_to_fp8x2", 2)
    run_vectorized_cast("float32", "float8_e4m3", "__nv_cvt_float2_to_fp8x2", 4)

    # fp32 -> fp8_e5m2
    run_vectorized_cast("float32", "float8_e5m2", "__nv_cvt_float2_to_fp8x2", 2)
    run_vectorized_cast("float32", "float8_e5m2", "__nv_cvt_float2_to_fp8x2", 4)

    # fp32 -> bf16
    # NOTE(wt): currently bf16 related ops don't support lanes=4;
    # we will add this in the future.
    run_vectorized_cast("float32", "bfloat16", "fastertransformer", 2)
    # run_vectorized_cast("float32", "bfloat16", "fastertransformer", 4)

    # bf16 -> fp32
    run_vectorized_cast("bfloat16", "float32", "fastertransformer", 2)
    # run_vectorized_cast("bfloat16", "float32", "fastertransformer", 4)


if __name__ == "__main__":
    tilelang.testing.main()
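For context on the check_str assertion: kernel.get_kernel_source() returns the generated CUDA, which contains the intrinsic emitted by the codegen change above whenever the vectorized path is taken. A hypothetical excerpt for the float32 -> float16, lanes=2 case (the buffer names are illustrative, not taken from actual output):

// Schematic line from the generated kernel source that
// check_str="__float22half2_rn" would match:
*(half2*)(&(B_local[0])) = __float22half2_rn(*(float2*)(&(A_local[0])));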
