
Commit 79e359b

add bfloat16 support for colwise scaling
stack-info: PR: #10, branch: danielvegamyhre/stack/4
Parent: cb95ba5

File tree: 3 files changed (+107 −66 lines)


benchmarks/mx_formats/cast_bench.py

Lines changed: 1 addition & 21 deletions
@@ -210,29 +210,9 @@ def run(
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
     elif mode == "dim0_dim1_cuda":
-        x = x.to(torch.float32)
-        y_d0, y_d1, s_d0, s_d1 = mxfp8_cuda.quantize(x, rowwise=True, colwise=True)
-
-        for _ in range(2):
-            __ = mxfp8_cuda.quantize(x, rowwise=True, colwise=True)
-
-        bench_fn = partial(mxfp8_cuda.quantize, rowwise=True, colwise=True)
-        time_us = benchmark_cuda_function_in_microseconds(bench_fn, x)
-
-        assert y_d0.dtype == torch.float8_e4m3fn
-        assert s_d0.dtype == torch.float8_e8m0fnu
-        assert y_d1.dtype == torch.float8_e4m3fn
-        assert s_d1.dtype == torch.float8_e8m0fnu
-
-        bytes_r = x.numel() * bytes_per_el_fp32
-        bytes_w = (
-            sum(t.numel() for t in [y_d0, y_d1, s_d0, s_d1]) * bytes_per_el_fp8
-        )
-        bytes_rw = bytes_r + bytes_w
-        bps = bytes_rw / (time_us / 1e6)
+        raise NotImplementedError("dim0_dim1_cuda not implemented yet")
 
     elif mode == "dim1_cuda":
-        x = x.to(torch.float32)
         _, y_d1, _, s_d1 = mxfp8_cuda.quantize(x, rowwise=False, colwise=True)
 
         for _ in range(2):
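
With the x.to(torch.float32) cast removed, the dim1_cuda branch now benchmarks the kernel on whatever dtype the benchmark tensor was created with, so a bfloat16 run exercises the new code path directly. A minimal standalone sketch of that flow (illustrative only; it assumes the compiled mxfp8_cuda extension and the benchmark_cuda_function_in_microseconds helper are in scope exactly as cast_bench.py uses them):

```python
from functools import partial

import torch

# Illustrative repro of the "dim1_cuda" benchmark branch. `mxfp8_cuda` and
# `benchmark_cuda_function_in_microseconds` are assumed to be importable as in cast_bench.py.
x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)

# Warmup, mirroring the benchmark loop, then time only the colwise (dim1) cast.
for _ in range(2):
    _ = mxfp8_cuda.quantize(x, rowwise=False, colwise=True)
bench_fn = partial(mxfp8_cuda.quantize, rowwise=False, colwise=True)
time_us = benchmark_cuda_function_in_microseconds(bench_fn, x)

# Memory traffic: bf16 input read, plus fp8 data and e8m0 scales written (1 byte each).
_, y_d1, _, s_d1 = mxfp8_cuda.quantize(x, rowwise=False, colwise=True)
bytes_rw = x.numel() * x.element_size() + y_d1.numel() + s_d1.numel()
print("GB/s:", bytes_rw / (time_us / 1e6) / 1e9)
```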

test/prototype/mx_formats/test_kernels.py

Lines changed: 3 additions & 2 deletions
@@ -481,9 +481,10 @@ def test_triton_mxfp8_dim1_randn(M, K):
 )
 @pytest.mark.parametrize("M", (32,64,2048))
 @pytest.mark.parametrize("K", (32,64,2048))
-def test_cuda_mx_dim1_randn(M, K):
+@pytest.mark.parametrize("input_dtype", (torch.float32,torch.bfloat16))
+def test_cuda_mx_dim1_randn(M, K, input_dtype):
     # Use disinct incrementing values from 0 to M*K-1 to make debugging easier.
-    x = torch.arange(0, M*K, dtype=torch.float32, device="cuda").reshape(M, K).contiguous()
+    x = torch.arange(0, M*K, dtype=input_dtype, device="cuda").reshape(M, K).contiguous()
 
     y_d1_ref, s_d1_ref = triton_to_mxfp8_dim1_reference(x, block_size=32)
     _, y_d1, _, s_d1 = mxfp8_cuda.quantize(x, rowwise=False, colwise=True)
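
The new input_dtype axis runs the same colwise (dim1) check for both float32 and bfloat16 inputs. A standalone sketch of a single parametrized case (assuming triton_to_mxfp8_dim1_reference and the compiled mxfp8_cuda extension are importable as in test_kernels.py):

```python
import torch

# One hypothetical parametrization of the test, written out by hand.
M, K, input_dtype = 32, 2048, torch.bfloat16
x = torch.arange(0, M * K, dtype=input_dtype, device="cuda").reshape(M, K).contiguous()

# Reference colwise MX cast vs. the CUDA kernel under test.
y_d1_ref, s_d1_ref = triton_to_mxfp8_dim1_reference(x, block_size=32)
_, y_d1, _, s_d1 = mxfp8_cuda.quantize(x, rowwise=False, colwise=True)

# MXFP8 outputs: fp8e4m3 data with one e8m0 scale per 32-element block.
assert y_d1.dtype == torch.float8_e4m3fn
assert s_d1.dtype == torch.float8_e8m0fnu
# The test then compares y_d1 / s_d1 against the Triton reference outputs.
```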

torchao/experimental/mxfp8_cpp/mxfp8_quantize.cuh

Lines changed: 103 additions & 43 deletions
@@ -48,18 +48,88 @@ enum class DType {
 
 // Data types
 using e8m0_t = uint8_t;
-using bf16 = nv_bfloat16;
+using bfloat16 = nv_bfloat16;
 using fp8e4m3 = __nv_fp8_e4m3;
 
-// Constants for dtype conversion
+constexpr size_t get_dtype_bits(DType dtype) {
+  switch (dtype) {
+    case DType::kFloat32:
+      return 32;
+    case DType::kBFloat16:
+      return 16;
+    case DType::kFloat8E4M3:
+      return 8;
+    default:
+      // TODO: something smarter than this
+      return 0;
+  }
+}
+
+// FP32 constants
 constexpr int32_t FP32_MANTISSA_BITS = 23;
 constexpr int32_t FP32_EXPONENT_BIAS = 127;
+
+// BF16 constants
+constexpr int32_t BF16_MANTISSA_BITS = 7;
+constexpr int32_t BF16_EXPONENT_BIAS = 127;
+
+// FP8E4M3 constants
 constexpr int32_t F8E4M3_MAX_POW2 = 8;
-constexpr int32_t E8M0_EXPONENT_BIAS= 127;
-constexpr int32_t F32_EXP_BIAS = 127;
 constexpr float F8E4M3_MAX = 448.0;
 
-// Constants for MXFP8
+// FP8E8M0 constants
+constexpr int32_t E8M0_EXPONENT_BIAS= 127;
+
+
+// 1. Base template (for unsupported types)
+template <typename T>
+struct DataTypeTraits {
+  static constexpr bool is_supported = false;
+};
+
+// 2. Specialization for float32
+template <>
+struct DataTypeTraits<float> {
+  static constexpr bool is_supported = true;
+  static constexpr int mantissa_bits = 23;
+  static constexpr int exponent_bias = 127;
+
+  __device__ static __forceinline__ float to_float(const float val) {
+    return val;
+  }
+};
+
+// 3. Specialization for bfloat16
+template <>
+struct DataTypeTraits<nv_bfloat16> {
+  static constexpr bool is_supported = true;
+  static constexpr int mantissa_bits = 7;
+  static constexpr int exponent_bias = 127;
+
+  __device__ static __forceinline__ float to_float(const nv_bfloat16 val) {
+    return __bfloat162float(val);
+  }
+};
+
+__device__ static __forceinline__ e8m0_t calculate_e8m0_biased_scale(const float amax) {
+  // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L239
+  const int32_t int_amax = *reinterpret_cast<const int32_t*>(&amax);
+  const int32_t extracted_pow2 = ((int_amax >> FP32_MANTISSA_BITS) & 0b11111111) - FP32_EXPONENT_BIAS;
+
+  // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L244
+  int32_t scale_unbiased = extracted_pow2 - F8E4M3_MAX_POW2;
+
+  // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L256
+  scale_unbiased = max(scale_unbiased, -E8M0_EXPONENT_BIAS);
+  scale_unbiased = min(scale_unbiased, E8M0_EXPONENT_BIAS + 1);
+  int32_t scale_with_e8m0_bias = scale_unbiased + E8M0_EXPONENT_BIAS;
+
+  // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L261C9-L261C26
+  const e8m0_t e8m0_biased_scale = *reinterpret_cast<e8m0_t*>(&scale_with_e8m0_bias);
+  return e8m0_biased_scale;
+}
+
+// Constants for MXFP8 kernel
 constexpr size_t MXFP8_CHUNK_DIM_Y = 64;
 constexpr size_t MXFP8_CHUNK_DIM_X = 64;
 constexpr size_t MXFP8_CHUNKS_PER_BLOCK_Y = 1;
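
calculate_e8m0_biased_scale factors the FLOOR-mode scale computation out of the colwise loop so the float32 and bfloat16 kernel instantiations can share it: the block amax (already reduced in fp32) has its exponent field extracted, F8E4M3_MAX_POW2 is subtracted to leave headroom for the fp8e4m3 range, and the result is clamped and re-biased into e8m0. A Python sketch of the same bit manipulation, for illustration only:

```python
import struct

FP32_MANTISSA_BITS = 23
FP32_EXPONENT_BIAS = 127
F8E4M3_MAX_POW2 = 8
E8M0_EXPONENT_BIAS = 127

def e8m0_biased_scale(amax: float) -> int:
    # Reinterpret the fp32 amax as int32 and extract its biased exponent, i.e. floor(log2(amax)).
    int_amax = struct.unpack("<i", struct.pack("<f", amax))[0]
    extracted_pow2 = ((int_amax >> FP32_MANTISSA_BITS) & 0xFF) - FP32_EXPONENT_BIAS
    # FLOOR mode: reserve the fp8e4m3 max power of two, then clamp into the e8m0 range and re-bias.
    scale_unbiased = extracted_pow2 - F8E4M3_MAX_POW2
    scale_unbiased = max(scale_unbiased, -E8M0_EXPONENT_BIAS)
    scale_unbiased = min(scale_unbiased, E8M0_EXPONENT_BIAS + 1)
    return scale_unbiased + E8M0_EXPONENT_BIAS

# Example: amax = 300.0 lies in [2^8, 2^9), so the unbiased scale is 8 - 8 = 0
# and the biased e8m0 value is 127 (a scale of 2^0 = 1.0).
assert e8m0_biased_scale(300.0) == 127
```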
@@ -343,6 +413,8 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
          (unsigned long long)rows, (unsigned long long)cols, (unsigned long long)scales_rowwise_stride_dim0, (unsigned long long)scales_rowwise_stride_dim1, (unsigned long long)scales_colwise_stride_dim0, (unsigned long long)scales_colwise_stride_dim1);
 #endif
 
+  static_assert(DataTypeTraits<IType>::is_supported, "Input data type is not supported by this kernel.");
+
   constexpr bool USE_ROWWISE_SCALING = SCALE_DIM_X > 1;
   constexpr bool USE_COLWISE_SCALING = SCALE_DIM_Y > 1;
 
@@ -505,7 +577,7 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
         const bool out_of_bounds = (col_out_of_bounds || row_out_of_bounds);
 
         // Load from shared memory into thread local registers.
-        float elt = static_cast<float>(in.data.elt[j]);
+        float elt = DataTypeTraits<IType>::to_float(in.data.elt[j]);
         in_compute[j] = elt;
 
         // Update thread local amax.
@@ -564,7 +636,7 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
         const bool out_of_bounds = (col_out_of_bounds || row_out_of_bounds);
 
         // Load from shared memory into thread local registers.
-        float elt = static_cast<float>(in_sh[buff][i][tid_colwise_X]);
+        float elt = DataTypeTraits<IType>::to_float(in_sh[buff][i][tid_colwise_X]);
         in_compute[i] = elt;
 
         // Update thread local amax.
@@ -580,55 +652,28 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
       // ******* END TE original ***********
 
       // ******* Updated implementation based on torchao to_mx() with ScaleCalculationMode=FLOOR **********
-      // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L239
-      const int32_t int_amax = *reinterpret_cast<const int32_t*>(&amax);
-      const int32_t extracted_pow2 = ((int_amax >> FP32_MANTISSA_BITS) & 0b11111111) - FP32_EXPONENT_BIAS;
-
-      // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L244
-      int32_t scale_unbiased = extracted_pow2 - F8E4M3_MAX_POW2;
-
-      // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L256
-      scale_unbiased = max(scale_unbiased, -E8M0_EXPONENT_BIAS);
-      scale_unbiased = min(scale_unbiased, E8M0_EXPONENT_BIAS + 1);
-      int32_t scale_with_e8m0_bias = scale_unbiased + E8M0_EXPONENT_BIAS;
-
-      // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L261C9-L261C26
-      const e8m0_t e8m0_biased_scale = *reinterpret_cast<e8m0_t*>(&scale_with_e8m0_bias);
+      const e8m0_t e8m0_biased_scale = calculate_e8m0_biased_scale(amax);
 
       // Calculate scale offsets and write scaling factor.
       const int global_scales_offset_Y = scales_colwise_chunk_offset_Y + iter;
       const int global_scales_offset_X = scales_colwise_chunk_offset_X + tid_colwise_X;
       const int scale_idx = global_scales_offset_Y * scales_colwise_stride_dim1 + global_scales_offset_X * scales_colwise_stride_dim0;
 
-      // Debug logging
-      #if defined(DEBUG)
-      printf("tid_colwise_X=%llu, scales_colwise_stride_dim0=%d, global_scales_offset_Y=%llu, global_scales_offset_X=%llu, scale_idx=%llu, amax=%d, extracted_pow_2=%d, scale_unbiased=%d, scale_with_e8m0_bias=%d, e8m0_biased_scale=%d, col_out_of_bounds=%d\n",
-        (unsigned long long)tid_colwise_X,
-        (unsigned long long)global_scales_offset_Y,
-        (unsigned long long)global_scales_offset_X,
-        (unsigned long long)scale_idx,
-        (int)(amax),
-        extracted_pow2,
-        scale_unbiased,
-        scale_with_e8m0_bias,
-        e8m0_biased_scale,
-        col_out_of_bounds);
-      #endif
-
       // Write scales to global memory.
       // I had to add this bounds check because the original code was having threads from the second `iter` overwrite values from the first.
       const bool row_out_of_bounds = (row_base >= rows);
       if (!row_out_of_bounds && !col_out_of_bounds) {
         scales_colwise[scale_idx] = e8m0_biased_scale;
       }
 
+      // Apply scales to do value conversion.
       // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L275C1-L277C30
       int32_t exponent_as_int32 = static_cast<int32_t>(e8m0_biased_scale);
       int32_t float_bits = exponent_as_int32 << FP32_MANTISSA_BITS;
       float scale_fp32 = *reinterpret_cast<float*>(&float_bits);
 
       // torchao ref: https://github.com/pytorch/ao/blob/00417b8b33abb75c54cdb347bd320fb6ac0a4d94/torchao/prototype/mx_formats/mx_tensor.py#L286
-      const float F32_MIN_NORMAL = exp2f(-F32_EXP_BIAS + 1);
+      const float F32_MIN_NORMAL = exp2f(-FP32_EXPONENT_BIAS + 1);
       scale_fp32 = max(scale_fp32, F32_MIN_NORMAL);
 
       // Use scales to perform value conversion.
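
The colwise branch then turns the biased e8m0 exponent back into an fp32 multiplier by writing it straight into the exponent field of a float and clamping to the smallest normal value (F32_MIN_NORMAL), mirroring the torchao reference linked above. Continuing the illustrative Python sketch:

```python
import struct

FP32_MANTISSA_BITS = 23
FP32_EXPONENT_BIAS = 127

def e8m0_to_fp32_scale(e8m0_biased_scale: int) -> float:
    # Place the e8m0 value into the exponent field of an fp32 (sign = 0, mantissa = 0).
    float_bits = e8m0_biased_scale << FP32_MANTISSA_BITS
    scale_fp32 = struct.unpack("<f", struct.pack("<I", float_bits))[0]
    # Clamp to the smallest normal fp32, as the kernel does with F32_MIN_NORMAL.
    return max(scale_fp32, 2.0 ** (-FP32_EXPONENT_BIAS + 1))

# 127 -> 2^(127 - 127) = 1.0; per the torchao reference, block values are divided by
# this scale before being cast to fp8e4m3.
assert e8m0_to_fp32_scale(127) == 1.0
```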
@@ -743,35 +788,42 @@ public:
     printf("grid.x=%d, grid.y=%d, block.x=%d, block.y=%d\n", grid.x, grid.y, block.x, block.y);
 #endif
 
+
     // Create TMA descriptors
     alignas(64) CUtensorMap tensor_map_input{};
     alignas(64) CUtensorMap tensor_map_output_rowwise{};
     alignas(64) CUtensorMap tensor_map_output_colwise{};
+    int32_t input_bits_per_elem = get_dtype_bits(input_dtype);
+    int32_t output_bits_per_elem = get_dtype_bits(output_dtype);
+
+#if defined(DEBUG)
+    printf("input_bits_per_elem=%d, output_bits_per_elem=%d\n", input_bits_per_elem, output_bits_per_elem);
+#endif
 
     create_2D_tensor_map(tensor_map_input,
                          const_cast<void *>(input),
                          input_dtype,
                          rows, cols,
                          MXFP8_SHMEM_DIM_Y, MXFP8_SHMEM_DIM_X,
-                         cols, // input stride along dim0
-                         32); // bits per elem in input
+                         cols, // input stride along dim0
+                         input_bits_per_elem); // bits per elem in input
 
     if (output_rowwise) {
       create_2D_tensor_map(tensor_map_output_rowwise, output_rowwise,
                            output_dtype,
                            rows, cols,
                            MXFP8_SHMEM_DIM_Y, MXFP8_SHMEM_DIM_X,
-                           cols, // input stride along dim0
-                           8); // bits per elem in output fp8e4m3
+                           cols, // input stride along dim0
+                           output_bits_per_elem); // bits per elem in output fp8e4m3
     }
 
     if (output_colwise) {
       create_2D_tensor_map(tensor_map_output_colwise, output_colwise,
                            output_dtype,
                            rows, cols,
                            MXFP8_SHMEM_DIM_Y, MXFP8_SHMEM_DIM_X,
-                           cols, // input stride along dim0
-                           8); // bits per elem in output fp8e4m3
+                           cols, // input stride along dim0
+                           output_bits_per_elem); // bits per elem in output fp8e4m3
     }
 
     // Launch kernel based on input/output types and scaling dimensions
@@ -807,6 +859,14 @@ public:
     } else if (scale_dim_x == 1 && scale_dim_y == 32) {
       LAUNCH_KERNEL(float, fp8e4m3, 32, 1);
     }
+  } else if (input_dtype == DType::kBFloat16) {
+    if (scale_dim_x == 32 && scale_dim_y == 32) {
+      LAUNCH_KERNEL(bfloat16, fp8e4m3, 32, 32);
+    } else if (scale_dim_x == 32 && scale_dim_y == 1) {
+      LAUNCH_KERNEL(bfloat16, fp8e4m3, 1, 32);
+    } else if (scale_dim_x == 1 && scale_dim_y == 32) {
+      LAUNCH_KERNEL(bfloat16, fp8e4m3, 32, 1);
+    }
   } else {
     printf("unsupported input dtype, must be float32\n");
     exit(1);
