Commit 396cca4

[Feature] Refactor bf16 conversion operations and remove legacy compile flags

1 parent be16c15 commit 396cca4

File tree

7 files changed: +62 additions, -92 deletions


examples/attention_sink/example_gqa_sink_bwd_bhsd.py

Lines changed: 4 additions & 8 deletions
@@ -23,8 +23,7 @@ def get_bwd_configs():
     out_idx=[3, 4],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_fwd(
         batch,
         heads,
@@ -143,8 +142,7 @@ def flash_fwd(
     out_idx=[2],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"):
     accum_dtype = "float"
     shape = [batch, heads, seq_len, dim]
@@ -183,8 +181,7 @@ def make_dq_layout(dQ):
     out_idx=[1],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"):
     accum_dtype = "float"
     shape = [batch, heads, seq_len, dim]
@@ -208,8 +205,7 @@ def flash_bwd_post(
 @tilelang.jit(
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd(batch,
                   heads,
                   seq_len,

examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py

Lines changed: 1 addition & 2 deletions
@@ -26,8 +26,7 @@ def get_configs():
     out_idx=[3],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn(
         batch,
         heads,

examples/attention_sink/example_mha_sink_bwd_bhsd.py

Lines changed: 4 additions & 8 deletions
@@ -23,8 +23,7 @@ def get_bwd_configs():
     out_idx=[3, 4],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_fwd(
         batch,
         heads,
@@ -140,8 +139,7 @@ def flash_fwd(
     out_idx=[2],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"):
     accum_dtype = "float"
     shape = [batch, heads, seq_len, dim]
@@ -180,8 +178,7 @@ def make_dq_layout(dQ):
     out_idx=[1],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"):
     accum_dtype = "float"
     shape = [batch, heads, seq_len, dim]
@@ -205,8 +202,7 @@ def flash_bwd_post(
 @tilelang.jit(
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn_bwd(
         batch,
         heads,

examples/attention_sink/example_mha_sink_fwd_bhsd.py

Lines changed: 1 addition & 2 deletions
@@ -21,8 +21,7 @@ def get_configs():
     out_idx=[3],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn(
         batch,
         heads,

examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py

Lines changed: 1 addition & 2 deletions
@@ -22,8 +22,7 @@ def get_configs():
     out_idx=[3],
     pass_configs={
         tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
-    },
-    compile_flags=["-O3", "-DENABLE_BF16"])
+    })
 def flashattn(
         batch,
         heads,

src/target/codegen_cuda.cc

Lines changed: 42 additions & 60 deletions
@@ -942,6 +942,48 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     }
   }

+  // Handle conversion between bfloat16 and float32
+  if (from_ty.is_bfloat16() && target_ty.is_float()) {
+    // Use __bfloat1622float2 for vectorized conversion (bfloat162 -> float2)
+    if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      // bfloat162 -> float2
+      PrintIndent();
+      stream << sret << " = __bfloat1622float2(*reinterpret_cast<__nv_bfloat162*>(&(" << src << ")));\n";
+      os << sret;
+      return;
+    } else if (from_ty.lanes() == 4 && target_ty.lanes() == 4) {
+      // bfloat162x2 -> float4
+      PrintIndent();
+      stream << "((float2*)(&" << sret << "))[0] = "
+             << "__bfloat1622float2(*reinterpret_cast<__nv_bfloat162*>(&(" << src << ")));\n";
+      PrintIndent();
+      stream << "((float2*)(&" << sret << "))[1] = "
+             << "__bfloat1622float2(*(reinterpret_cast<__nv_bfloat162*>(&(" << src << "))+1));\n";
+      os << sret;
+      return;
+    }
+  } else if (from_ty.is_float() && target_ty.is_bfloat16()) {
+    // Use __float22bfloat162_rn for vectorized conversion (float2 -> bfloat162)
+    if (from_ty.lanes() == 2 && target_ty.lanes() == 2) {
+      // float2 -> bfloat162
+      PrintIndent();
+      stream << "*reinterpret_cast<__nv_bfloat162*>(&(" << sret << ")) = __float22bfloat162_rn(*(float2*)(&("
+             << src << ")));\n";
+      os << sret;
+      return;
+    } else if (from_ty.lanes() == 4 && target_ty.lanes() == 4) {
+      // float4 -> bfloat162x2
+      PrintIndent();
+      stream << "(reinterpret_cast<__nv_bfloat162*>(&" << sret << "))[0] = "
+             << "__float22bfloat162_rn(*(float2*)(&(" << src << ")));\n";
+      PrintIndent();
+      stream << "(reinterpret_cast<__nv_bfloat162*>(&" << sret << "))[1] = "
+             << "__float22bfloat162_rn(*((float2*)(&(" << src << "))+1));\n";
+      os << sret;
+      return;
+    }
+  }
+
   // Handle conversion from float32 to float8 (E4M3/E5M2)
   if (from_ty.is_float() &&
       (target_ty.is_float8_e4m3() || target_ty.is_float8_e5m2())) {
@@ -974,63 +1016,6 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     }
   }

-  // Handle bfloat16 special cases with supported ops
-  // NOTE(wt): Currently bf16 related ops don't support lanes=4,
-  // We should add this in the future.
-  bool used_bf16_op = false;
-  if (from_ty.is_bfloat16() || target_ty.is_bfloat16()) {
-    std::ostringstream func_name;
-    if (from_ty.is_bfloat16()) {
-      func_name << "bf16";
-    } else if (from_ty.is_float()) {
-      func_name << "float";
-    }
-    if (from_ty.lanes() > 1) {
-      func_name << from_ty.lanes();
-    }
-    func_name << "2";
-    if (target_ty.is_bfloat16()) {
-      func_name << "bf16";
-    } else if (target_ty.is_float()) {
-      func_name << "float";
-    } else if (target_ty == DataType::Int(16)) {
-      func_name << "int16";
-    }
-    if (target_ty.lanes() > 1) {
-      func_name << target_ty.lanes();
-    }
-
-    auto fname = func_name.str();
-    if (bf16_supported_ops_.count(fname)) {
-      used_bf16_op = true;
-      stream << "#ifdef ENABLE_BF16\n";
-      PrintIndent();
-      stream << "reinterpret_cast<";
-      if (target_ty.is_bfloat16()) {
-        stream << "__nv_bfloat16";
-      } else {
-        PrintType(target_ty.element_of(), stream);
-      }
-      if (target_ty.lanes() > 1) {
-        stream << target_ty.lanes();
-      }
-      stream << " &>(" << sret << ") = fastertransformer::" << fname
-             << "(reinterpret_cast<";
-      if (from_ty.is_bfloat16()) {
-        stream << "__nv_bfloat16";
-      } else {
-        PrintType(from_ty.element_of(), stream);
-      }
-      if (from_ty.lanes() > 1) {
-        stream << from_ty.lanes();
-      }
-      stream << " const &>(" << src << "));\n";
-      stream << "#else\n";
-      // bf16 cases don't need early return, as we use elementwise cast as
-      // fallback
-    }
-  }
-
   // Fallback: elementwise cast
   for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) {
     std::ostringstream val;
@@ -1042,9 +1027,6 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) {
     PrintVecElemStore(sret, target_ty, i, val.str());
   }

-  if (used_bf16_op) {
-    stream << "#endif\n";
-  }
   os << sret;
 }

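For context, here is a minimal standalone CUDA sketch (not part of this commit) of the conversion pattern the generated code now relies on: __bfloat1622float2 widens a packed __nv_bfloat162 to a float2, __float22bfloat162_rn narrows a float2 back with round-to-nearest-even, and a 4-lane vector is handled as two packed halves, mirroring the two-statement sequence emitted above. The kernel and helper names below are illustrative, not taken from the repository.

#include <cstdio>
#include <cuda_bf16.h>

// Illustrative helpers following the same intrinsic pattern the codegen now
// emits for bf16 <-> fp32 casts.
__device__ __forceinline__ float2 bf16x2_to_float2(__nv_bfloat162 v) {
  return __bfloat1622float2(v);  // lanes == 2: widen both lanes at once
}

__device__ __forceinline__ __nv_bfloat162 float2_to_bf16x2(float2 v) {
  return __float22bfloat162_rn(v);  // lanes == 2: narrow both lanes, round-to-nearest-even
}

// lanes == 4: split the float4 into two packed __nv_bfloat162 halves and back,
// mirroring the two assignments the codegen produces for 4-lane casts.
__global__ void bf16_roundtrip(const float4 *in, float4 *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n)
    return;
  float4 v = in[i];
  __nv_bfloat162 lo = float2_to_bf16x2(make_float2(v.x, v.y));
  __nv_bfloat162 hi = float2_to_bf16x2(make_float2(v.z, v.w));
  float2 flo = bf16x2_to_float2(lo);
  float2 fhi = bf16x2_to_float2(hi);
  out[i] = make_float4(flo.x, flo.y, fhi.x, fhi.y);
}

int main() {
  const int n = 256;
  float4 h_in[n], h_out[n];
  for (int i = 0; i < n; ++i)
    h_in[i] = make_float4(0.1f * i, 0.2f * i, 0.3f * i, 0.4f * i);

  float4 *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float4));
  cudaMalloc(&d_out, n * sizeof(float4));
  cudaMemcpy(d_in, h_in, n * sizeof(float4), cudaMemcpyHostToDevice);

  bf16_roundtrip<<<1, n>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float4), cudaMemcpyDeviceToHost);

  // bf16 keeps roughly 8 bits of mantissa, so the round trip is approximate.
  printf("in = %f, after bf16 round trip = %f\n", h_in[10].x, h_out[10].x);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Because these intrinsics come from cuda_bf16.h rather than the fastertransformer helpers that the removed #ifdef ENABLE_BF16 path called, the generated code no longer depends on the -DENABLE_BF16 define, which is why the example files above drop it from compile_flags.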
testing/python/language/test_tilelang_language_vectorized_cast.py

Lines changed: 9 additions & 10 deletions
@@ -11,7 +11,7 @@
 }


-@tilelang.jit(compile_flags=['-DENABLE_BF16'])
+@tilelang.jit
 def vectorized_cast_kernel(M: int, dtype_A: str, dtype_B: str):
     assert M % 256 == 0

@@ -57,28 +57,27 @@ def test_vectorized_cast():
     run_vectorized_cast("float32", "float16", "__float22half2_rn", 2)
     run_vectorized_cast("float32", "float16", "__float22half2_rn", 4)

-    # # fp16 -> fp32
+    # fp16 -> fp32
     run_vectorized_cast("float16", "float32", "__half22float2", 2)
     run_vectorized_cast("float16", "float32", "__half22float2", 4)

-    # # fp32 -> fp8_e4m3
+    # fp32 -> fp8_e4m3
     run_vectorized_cast("float32", "float8_e4m3", "__nv_cvt_float2_to_fp8x2", 2)
     run_vectorized_cast("float32", "float8_e4m3", "__nv_cvt_float2_to_fp8x2", 4)

-    # # fp32 -> fp8_e5m2
+    # fp32 -> fp8_e5m2
     run_vectorized_cast("float32", "float8_e5m2", "__nv_cvt_float2_to_fp8x2", 2)
     run_vectorized_cast("float32", "float8_e5m2", "__nv_cvt_float2_to_fp8x2", 4)

     # fp32 -> bf16
-    # NOTE(wt): currently bf16 related ops don't support lanes=4,
-    # We will add this in the future.
-    run_vectorized_cast("float32", "bfloat16", "fastertransformer", 2)
-    # run_vectorized_cast("float32", "bfloat16", "fastertransformer", 4)
+    run_vectorized_cast("float32", "bfloat16", "__float22bfloat162_rn", 2)
+    run_vectorized_cast("float32", "bfloat16", "__float22bfloat162_rn", 4)

     # bf16 -> fp32
-    run_vectorized_cast("bfloat16", "float32", "fastertransformer", 2)
-    # run_vectorized_cast("bfloat16", "float32", "fastertransformer", 4)
+    run_vectorized_cast("bfloat16", "float32", "__bfloat1622float2", 2)
+    run_vectorized_cast("bfloat16", "float32", "__bfloat1622float2", 4)


 if __name__ == "__main__":
     tilelang.testing.main()
+

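As a companion to the updated expectations above, here is a compile-only CUDA sketch (assumed, not taken from the repository) contrasting the two lowering styles involved: the elementwise fallback that codegen_cuda.cc keeps as a last resort converts one lane at a time, while the vectorized path converts a packed pair per call, which is the intrinsic name (__float22bfloat162_rn / __bfloat1622float2) the test now looks for in the generated source for both 2 and 4 lanes.

#include <cuda_bf16.h>

// Elementwise style: one __float2bfloat16_rn per lane (the generic fallback).
__global__ void cast_elementwise(const float *in, __nv_bfloat16 *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = __float2bfloat16_rn(in[i]);
}

// Vectorized style: one __float22bfloat162_rn per packed pair of lanes.
__global__ void cast_packed(const float2 *in, __nv_bfloat162 *out, int n_pairs) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_pairs)
    out[i] = __float22bfloat162_rn(in[i]);
}

// Reverse direction, packed: one __bfloat1622float2 per __nv_bfloat162.
__global__ void widen_packed(const __nv_bfloat162 *in, float2 *out, int n_pairs) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n_pairs)
    out[i] = __bfloat1622float2(in[i]);
}

A 4-lane cast simply issues two of the packed conversions, so the lanes=2 and lanes=4 cases in the test resolve to the same intrinsic name.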