Commit 6d2d287

[Precision] Introduce T.ieee_rsqrt and related high precision op (tile-ai#882)
* Add fast math operations for CUDA: exp, exp10, log, log2, log10, tan, cos, and sin (tile-ai#865).
* Refactor the fast math operation definitions for consistency and readability in the CUDA code; consolidate multiple definitions into single lines and improve formatting in the related test files.
* Remove unnecessary pass configurations for warp specialization and TMA lowering in the CUDA fast math tests; this simplifies the test setup while keeping the focus on fast math functionality.
* Update the fastmath tests to reflect that tl.* intrinsics generate no fastmath versions, and disable the cache in the main execution.
* Fix formatting of fastmath test comments for clarity on tl.* intrinsics behavior.
* Add a precision comparison tool for CUDA operations: a new Python script and CUDA source file that evaluate the accuracy of division, reciprocal, exponential, logarithmic, and trigonometric functions across CUDA Precise, CUDA Fast, Triton, Triton LibDevice, and TileLang. The tool generates test data, executes the operations, and summarizes error statistics for each implementation against a double-precision reference; a README documents the comparison results.
* Add a precision comparison tool for CUDA operations (second iteration): implemented in Python and CUDA, it evaluates the accuracy of division, reciprocal, exponential, logarithmic, trigonometric, square root, and other operations across CUDA Precise/Fast, Triton, Triton LibDevice, PyTorch, and TileLang, and collects error metrics for each operation in a comprehensive README.
* Add IEEE-compliant mathematical operations and refactor the fast math module: introduce ieee_add, ieee_sub, ieee_mul, ieee_fmaf, ieee_frcp, ieee_fsqrt, ieee_frsqrt, and ieee_fdiv to TileLang, remove the deprecated fastmath.py file, update the import paths, and extend the CUDA code generation to emit these operations with IEEE-compliant floating-point semantics.
* Remove debug output.
* Refactor the IEEE math tests for readability and consistency: adjust line breaks in test_ieee_math.py and test_mathops_fastmath.py, remove unnecessary comments, and streamline the main test execution.
* Update README.md to present the precision comparison results as structured markdown tables, improving readability of the error statistics for FP32 Precise, Triton, TileLang, and CUDA.
1 parent c10fffb commit 6d2d287
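
On the frontend, the new intrinsics surface as T.ieee_add, T.ieee_sub, T.ieee_mul, T.ieee_fmaf, T.ieee_frcp, T.ieee_fsqrt, T.ieee_frsqrt, and T.ieee_fdiv (see the tests further down). The Python wrappers themselves are not part of the diffs shown on this page, so the following is only a minimal sketch of how such a wrapper could look; the op name "tl.ieee_add" and the use of tir.call_intrin are assumptions, while the rounding modes and the "Invalid rounding mode" error text match what the new tests expect.

    # Minimal sketch, not the actual tilelang.language implementation.
    # Assumes the builtin from src/op/builtin.cc is registered as "tl.ieee_add"
    # and that importing tilelang makes it visible to tvm.ir.Op.get.
    from tvm import tir

    VALID_ROUNDING_MODES = ("rn", "rz", "ru", "rd")


    def ieee_add(x, y, rounding_mode="rn"):
        """IEEE-compliant float32 addition with an explicit CUDA rounding mode."""
        if rounding_mode not in VALID_ROUNDING_MODES:
            raise ValueError(f"Invalid rounding mode: {rounding_mode}")
        # The rounding mode travels as a trailing string argument, which is why
        # ieee_add is registered with num_inputs = 3.
        return tir.call_intrin("float32", "tl.ieee_add", x, y, rounding_mode)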

File tree

8 files changed: +1155 additions, -110 deletions


maint/precision/README.md

Lines changed: 119 additions & 109 deletions
Large diffs are not rendered by default.

src/op/builtin.cc

Lines changed: 29 additions & 0 deletions
@@ -66,6 +66,35 @@ TIR_DEFINE_TL_BUILTIN(__cos).set_num_inputs(1).set_attr<TCallEffectKind>(
 TIR_DEFINE_TL_BUILTIN(__sin).set_num_inputs(1).set_attr<TCallEffectKind>(
     "TCallEffectKind", Integer(CallEffectKind::kOpaque));
 
+// high precision with IEEE-compliant
+TIR_DEFINE_TL_BUILTIN(ieee_add).set_num_inputs(3).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_sub).set_num_inputs(3).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_mul).set_num_inputs(3).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_fmaf).set_num_inputs(4).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_frcp).set_num_inputs(2).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_fsqrt)
+    .set_num_inputs(2)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_frsqrt)
+    .set_num_inputs(1)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kPure));
+
+TIR_DEFINE_TL_BUILTIN(ieee_fdiv).set_num_inputs(3).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kPure));
+
 TIR_DEFINE_TL_BUILTIN(create_list_of_mbarrier)
     .set_num_inputs(-1)
     .set_attr<TCallEffectKind>("TCallEffectKind",
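
Note the arity convention in these registrations: the rounding-mode string is passed as an ordinary argument, so it is counted in num_inputs (3 for the binary ops, 4 for ieee_fmaf, 2 for ieee_frcp/ieee_fsqrt, and 1 for ieee_frsqrt, which has no mode). Below is a hedged sketch of the TIR call expressions this implies; it assumes the ops are registered in the "tl." op namespace and that importing tilelang registers them.

    # Sketch only: builds the call expressions the registrations above imply.
    import tilelang  # noqa: F401 -- assumed to register the tl.* builtins on import
    from tvm import tir

    x = tir.Var("x", "float32")
    y = tir.Var("y", "float32")
    z = tir.Var("z", "float32")

    # The trailing "rn" is the rounding mode, hence num_inputs = 4 for ieee_fmaf.
    fma = tir.call_intrin("float32", "tl.ieee_fmaf", x, y, z, "rn")
    # ieee_frsqrt is round-to-nearest only, hence num_inputs = 1.
    rsq = tir.call_intrin("float32", "tl.ieee_frsqrt", x)
    print(fma)
    print(rsq)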

src/op/builtin.h

Lines changed: 26 additions & 0 deletions
@@ -90,15 +90,41 @@ static constexpr const char *kDynamicAlignment = "tl.dynamic_alignment";
 DataType cuTensorMapType();
 
 // fast math related op
+// __exp(x) - fast exponential
 TVM_DLL const Op &__exp();
+// __exp10(x) - fast base-10 exponential
 TVM_DLL const Op &__exp10();
+// __log(x) - fast natural logarithm
 TVM_DLL const Op &__log();
+// __log2(x) - fast base-2 logarithm
 TVM_DLL const Op &__log2();
+// __log10(x) - fast base-10 logarithm
 TVM_DLL const Op &__log10();
+// __tan(x) - fast tangent
 TVM_DLL const Op &__tan();
+// __cos(x) - fast cosine
 TVM_DLL const Op &__cos();
+// __sin(x) - fast sine
 TVM_DLL const Op &__sin();
 
+// high precision with IEEE-compliant.
+// ieee_add(x, y, rounding_mode) - IEEE-compliant addition
+TVM_DLL const Op &ieee_add();
+// ieee_sub(x, y, rounding_mode) - IEEE-compliant subtraction
+TVM_DLL const Op &ieee_sub();
+// ieee_mul(x, y, rounding_mode) - IEEE-compliant multiplication
+TVM_DLL const Op &ieee_mul();
+// ieee_fmaf(x, y, z, rounding_mode) - IEEE-compliant fused multiply-add
+TVM_DLL const Op &ieee_fmaf();
+// ieee_frcp(x, rounding_mode) - IEEE-compliant reciprocal
+TVM_DLL const Op &ieee_frcp();
+// ieee_fsqrt(x, rounding_mode) - IEEE-compliant square root
+TVM_DLL const Op &ieee_fsqrt();
+// ieee_frsqrt(x) - IEEE-compliant reciprocal square root (rn only)
+TVM_DLL const Op &ieee_frsqrt();
+// ieee_fdiv(x, y, rounding_mode) - IEEE-compliant division
+TVM_DLL const Op &ieee_fdiv();
+
 /*!
  * \brief tvm intrinsics for TMADescriptor creation for tiled load
  *
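
For reference, the rounding_mode strings documented above follow the standard CUDA intrinsic suffixes, which correspond to the four IEEE 754 rounding directions. The snippet below is a plain restatement of that convention, not code from this commit.

    # CUDA rounding-mode suffixes used by the ieee_* ops and their IEEE 754 meaning.
    ROUNDING_MODES = {
        "rn": "round to nearest, ties to even",
        "rz": "round toward zero (truncate)",
        "ru": "round toward +infinity (round up)",
        "rd": "round toward -infinity (round down)",
    }
    # ieee_frsqrt takes no mode argument because CUDA only provides the
    # round-to-nearest variant (__frsqrt_rn) for reciprocal square root.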

src/target/codegen_cuda.cc

Lines changed: 56 additions & 0 deletions
@@ -94,6 +94,18 @@ struct CUDAFastMathTan : public CUDAMath {
   }
 };
 
+struct CUDAIEEEMath {
+  std::string operator()(DataType t, std::string name,
+                         std::string rounding_mode) const {
+    if (t.is_float() && t.bits() == 32) {
+      return "__" + name + "_" + rounding_mode;
+    } else if (t.is_float() && t.bits() == 64) {
+      return "__d" + name + "_" + rounding_mode;
+    }
+    return "";
+  }
+};
+
 static std::string GetFP8Type(DataType type) {
   std::stringstream stream;
   int32_t lanes = type.lanes();

@@ -1733,6 +1745,50 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
     CUDAFastMath math_func;
     std::string func_name = math_func(op->dtype, "sin");
     os << func_name << "(" << PrintExpr(op->args[0]) << ")";
+  } else if (op->op.same_as(tl::ieee_add())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[2])->value;
+    std::string func_name = math_func(op->dtype, "fadd", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ", "
+       << PrintExpr(op->args[1]) << ")";
+  } else if (op->op.same_as(tl::ieee_sub())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[2])->value;
+    std::string func_name = math_func(op->dtype, "fsub", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ", "
+       << PrintExpr(op->args[1]) << ")";
+  } else if (op->op.same_as(tl::ieee_mul())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[2])->value;
+    std::string func_name = math_func(op->dtype, "fmul", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ", "
+       << PrintExpr(op->args[1]) << ")";
+  } else if (op->op.same_as(tl::ieee_fmaf())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[3])->value;
+    std::string func_name = math_func(op->dtype, "fmaf", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ", "
+       << PrintExpr(op->args[1]) << ", " << PrintExpr(op->args[2]) << ")";
+  } else if (op->op.same_as(tl::ieee_frcp())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[1])->value;
+    std::string func_name = math_func(op->dtype, "frcp", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ")";
+  } else if (op->op.same_as(tl::ieee_fsqrt())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[1])->value;
+    std::string func_name = math_func(op->dtype, "fsqrt", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ")";
+  } else if (op->op.same_as(tl::ieee_frsqrt())) {
+    CUDAIEEEMath math_func;
+    std::string func_name = math_func(op->dtype, "frsqrt", "rn");
+    os << func_name << "(" << PrintExpr(op->args[0]) << ")";
+  } else if (op->op.same_as(tl::ieee_fdiv())) {
+    CUDAIEEEMath math_func;
+    std::string rounding_mode = Downcast<StringImm>(op->args[2])->value;
+    std::string func_name = math_func(op->dtype, "fdiv", rounding_mode);
+    os << func_name << "(" << PrintExpr(op->args[0]) << ", "
+       << PrintExpr(op->args[1]) << ")";
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
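
The CUDAIEEEMath helper above builds the intrinsic name by concatenating a prefix, the operation name, and the rounding-mode suffix. Below is a small illustration of the float32 branch, a mirror of the C++ logic above for illustration only; the fp64 branch uses the "__d" prefix instead.

    # Mirrors the float32 branch of CUDAIEEEMath shown above, for illustration only.
    def cuda_intrinsic_name(name: str, rounding_mode: str) -> str:
        return "__" + name + "_" + rounding_mode


    # ieee_add(..., "rn") on float32 therefore lowers to __fadd_rn, ieee_fmaf(..., "rz")
    # to __fmaf_rz, ieee_fdiv(..., "rd") to __fdiv_rd, and ieee_frsqrt(...) to __frsqrt_rn,
    # all of which are documented CUDA device intrinsics.
    for op_name, mode in [("fadd", "rn"), ("fmaf", "rz"), ("fdiv", "rd"), ("frsqrt", "rn")]:
        print(cuda_intrinsic_name(op_name, mode))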
test_ieee_math.py (new file)

Lines changed: 237 additions & 0 deletions
import tilelang
import tilelang.language as T
import torch
import tilelang.testing
import pytest


def run_ieee_math_test(mathop_name,
                       mathop_func,
                       rounding_mode="rn",
                       M=128,
                       N=128,
                       block_M=32,
                       block_N=32,
                       dtype="float32"):
    """
    Test IEEE-compliant math operations with specified rounding modes.
    """

    # Define the appropriate function based on operation type to avoid TVM parsing conflicts
    if mathop_name == "ieee_fmaf":

        @T.prim_func
        def main_func(
                A: T.Tensor((M, N), dtype),
                B: T.Tensor((M, N), dtype),
                C: T.Tensor((M, N), dtype),
                D: T.Tensor((M, N), dtype),
        ):
            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
                for i, j in T.Parallel(block_M, block_N):
                    D[by * block_M + i,
                      bx * block_N + j] = mathop_func(A[by * block_M + i, bx * block_N + j],
                                                      B[by * block_M + i, bx * block_N + j],
                                                      C[by * block_M + i,
                                                        bx * block_N + j], rounding_mode)

        out_idx = [3]
        num_inputs = 3
    elif mathop_name in ["ieee_add", "ieee_sub", "ieee_mul", "ieee_fdiv"]:

        @T.prim_func
        def main_func(
                A: T.Tensor((M, N), dtype),
                B: T.Tensor((M, N), dtype),
                C: T.Tensor((M, N), dtype),
        ):
            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
                for i, j in T.Parallel(block_M, block_N):
                    C[by * block_M + i,
                      bx * block_N + j] = mathop_func(A[by * block_M + i, bx * block_N + j],
                                                      B[by * block_M + i,
                                                        bx * block_N + j], rounding_mode)

        out_idx = [2]
        num_inputs = 2
    else:  # Single argument operations

        @T.prim_func
        def main_func(
                A: T.Tensor((M, N), dtype),
                B: T.Tensor((M, N), dtype),
        ):
            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
                for i, j in T.Parallel(block_M, block_N):
                    B[by * block_M + i,
                      bx * block_N + j] = mathop_func(A[by * block_M + i, bx * block_N + j],
                                                      rounding_mode)

        out_idx = [1]
        num_inputs = 1

    # Test compilation
    kernel = tilelang.compile(
        main_func,
        out_idx=out_idx,
        target="cuda",
        pass_configs={
            tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: False,
        })

    print(f"\n=== Testing {mathop_name} with rounding mode {rounding_mode} ===")
    print(f"✓ {mathop_name} compilation test passed")

    # Test numerical execution
    torch_dtype = getattr(torch, dtype)
    a = torch.randn(M, N, device="cuda", dtype=torch_dtype)

    if num_inputs >= 2:
        b = torch.randn(M, N, device="cuda", dtype=torch_dtype)
    if num_inputs == 3:
        c = torch.randn(M, N, device="cuda", dtype=torch_dtype)

    # Ensure positive values for functions that need them
    if mathop_name in ["ieee_frcp", "ieee_fsqrt"]:
        a = torch.abs(a) + 0.1
    elif mathop_name == "ieee_fdiv":
        b = torch.abs(b) + 0.1  # Avoid division by zero

    # Execute kernel
    try:
        if num_inputs == 1:
            result = kernel(a)
        elif num_inputs == 2:
            result = kernel(a, b)
        else:  # num_inputs == 3
            result = kernel(a, b, c)

        assert result is not None
        print(f"✓ {mathop_name} numerical execution test passed")
    except Exception as e:
        print(f"Warning: {mathop_name} execution failed: {e}")


def test_rounding_mode_validation():
    """Test that invalid rounding modes raise ValueError"""

    # Test with invalid rounding mode
    with pytest.raises(ValueError, match="Invalid rounding mode"):
        T.ieee_add(1.0, 2.0, "invalid_mode")

    with pytest.raises(ValueError, match="Invalid rounding mode"):
        T.ieee_mul(1.0, 2.0, "xy")

    with pytest.raises(ValueError, match="Invalid rounding mode"):
        T.ieee_fsqrt(4.0, "bad_mode")

    print("✓ Rounding mode validation test passed")


@tilelang.testing.requires_cuda
def test_ieee_add_all_rounding_modes():
    """Test IEEE addition with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_add", T.ieee_add, rounding_mode=mode)
        print(f"✓ ieee_add with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_sub_all_rounding_modes():
    """Test IEEE subtraction with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_sub", T.ieee_sub, rounding_mode=mode)
        print(f"✓ ieee_sub with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_mul_all_rounding_modes():
    """Test IEEE multiplication with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_mul", T.ieee_mul, rounding_mode=mode)
        print(f"✓ ieee_mul with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_fmaf_all_rounding_modes():
    """Test IEEE fused multiply-add with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_fmaf", T.ieee_fmaf, rounding_mode=mode)
        print(f"✓ ieee_fmaf with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_frcp_all_rounding_modes():
    """Test IEEE reciprocal with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_frcp", T.ieee_frcp, rounding_mode=mode)
        print(f"✓ ieee_frcp with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_fsqrt_all_rounding_modes():
    """Test IEEE square root with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_fsqrt", T.ieee_fsqrt, rounding_mode=mode)
        print(f"✓ ieee_fsqrt with {mode} passed")


@tilelang.testing.requires_cuda
def test_ieee_frsqrt_rn_only():
    """Test IEEE reciprocal square root (round to nearest only)"""

    @T.prim_func
    def main(
            A: T.Tensor((128, 128), "float32"),
            B: T.Tensor((128, 128), "float32"),
    ):
        with T.Kernel(T.ceildiv(128, 32), T.ceildiv(128, 32), threads=128) as (bx, by):
            for i, j in T.Parallel(32, 32):
                B[by * 32 + i, bx * 32 + j] = T.ieee_frsqrt(A[by * 32 + i, bx * 32 + j])

    kernel = tilelang.compile(
        main,
        out_idx=[1],
        target="cuda",
        pass_configs={
            tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: False,
        })

    print("\n=== Testing ieee_frsqrt (rn only) ===")
    print("✓ ieee_frsqrt compilation test passed")

    # Test numerical execution
    a = torch.abs(torch.randn(128, 128, device="cuda", dtype=torch.float32)) + 0.1

    try:
        result = kernel(a)
        assert result is not None
        print("✓ ieee_frsqrt numerical execution test passed")
    except Exception as e:
        print(f"Warning: ieee_frsqrt execution failed: {e}")


@tilelang.testing.requires_cuda
def test_ieee_fdiv_all_rounding_modes():
    """Test IEEE division with all rounding modes"""
    rounding_modes = ["rn", "rz", "ru", "rd"]

    for mode in rounding_modes:
        run_ieee_math_test("ieee_fdiv", T.ieee_fdiv, rounding_mode=mode)
        print(f"✓ ieee_fdiv with {mode} passed")


if __name__ == "__main__":
    tilelang.testing.main()
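
The tests above only assert that compilation and execution succeed. A numerical check against a PyTorch reference can be layered on the same pattern; the sketch below reuses the compile options from the tests and picks T.ieee_fdiv with round-to-nearest, where a close match to PyTorch's float32 division is expected. The tolerances are illustrative and not part of the commit.

    # Minimal sketch of a numerical check, following the test pattern above.
    import tilelang
    import tilelang.language as T
    import torch


    @T.prim_func
    def fdiv_kernel(
            A: T.Tensor((128, 128), "float32"),
            B: T.Tensor((128, 128), "float32"),
            C: T.Tensor((128, 128), "float32"),
    ):
        with T.Kernel(T.ceildiv(128, 32), T.ceildiv(128, 32), threads=128) as (bx, by):
            for i, j in T.Parallel(32, 32):
                C[by * 32 + i, bx * 32 + j] = T.ieee_fdiv(A[by * 32 + i, bx * 32 + j],
                                                          B[by * 32 + i, bx * 32 + j], "rn")


    kernel = tilelang.compile(
        fdiv_kernel,
        out_idx=[2],
        target="cuda",
        pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: False})

    a = torch.randn(128, 128, device="cuda")
    b = torch.abs(torch.randn(128, 128, device="cuda")) + 0.1  # avoid division by zero
    c = kernel(a, b)
    # __fdiv_rn is correctly rounded, so it should match torch's float32 division closely.
    torch.testing.assert_close(c, a / b, rtol=1e-6, atol=1e-6)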
