
Commit faed43b

Enable fp8 bias support and add corresponding tests. Adjust the skinny GEMM tests to be zero-centered, to avoid saturation and false passes.
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
1 parent b43cee1 commit faed43b
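
The zero-centering change is easier to see with a rough magnitude check. A minimal sketch (plain PyTorch, illustrative only, not part of the commit): with uniform [0, 1) inputs every product is positive, so a length-k dot product grows to roughly k/4 and a 1% relative tolerance swallows sizable absolute errors; zero-centered inputs keep outputs near zero, where the same errors still fail the comparison.

    import torch

    k = 4096
    a_pos, b_pos = torch.rand(1, k), torch.rand(1, k)              # mean ~0.5
    a_ctr, b_ctr = torch.rand(1, k) - 0.5, torch.rand(1, k) - 0.5  # mean ~0

    print((a_pos @ b_pos.t()).item())  # ~k/4 ~ 1024: rtol=0.01 tolerates errors ~10
    print((a_ctr @ b_ctr.t()).item())  # ~sqrt(k)/12 ~ 5: errors of that size still fail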

File tree

3 files changed: +81 −24 lines
csrc/rocm/skinny_gemms.cu
tests/kernels/quantization/test_rocm_skinny_gemms.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py

csrc/rocm/skinny_gemms.cu

Lines changed: 9 additions & 10 deletions
@@ -1338,7 +1338,7 @@ template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
                       const int By, const fp8_t* B, const fp8_t* __restrict__ A,
-                      const fp8_t* __restrict__ BIAS, scalar_t* C,
+                      const scalar_t* __restrict__ BIAS, scalar_t* C,
                       const float* __restrict__ s_A,
                       const float* __restrict__ s_B, const int _WvPrGrp,
                       const int CuCount) {
@@ -1491,8 +1491,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
           // TODO: Determine data type conversion of bias for fp8
-          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA *
-                                                 sB);  // + BIAS[(m+y)%Bx]);
+          scalar_t out = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
+          C[m + y + n * M] = BIAS ? (out + BIAS[(m + y) % Bx + (n % By) * M]) : out;
        }
      }
    }
@@ -1506,7 +1506,7 @@ template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
 __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
                                   const int Bx, const int By, const fp8_t* B,
                                   const fp8_t* __restrict__ A,
-                                  const fp8_t* __restrict__ BIAS, scalar_t* C,
+                                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                                   const float* __restrict__ s_A,
                                   const float* __restrict__ s_B,
                                   const int _WvPrGrp, const int CuCount) {
@@ -1520,7 +1520,7 @@ template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
     wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
                   const int By, const fp8_t* B, const fp8_t* __restrict__ A,
-                  const fp8_t* __restrict__ BIAS, scalar_t* C,
+                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                   const float* __restrict__ s_A, const float* __restrict__ s_B,
                   const int _WvPrGrp, const int CuCount) {
   constexpr int max_lds_len = LDS_SIZE;
@@ -1668,9 +1668,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
           if (y + m >= M) break;  // To avoid mem access fault.
-          // TODO: Determine data type conversion of bias for fp8
-          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA *
-                                                 sB);  // + BIAS[(m+y)%Bx]);
+          scalar_t out = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
+          C[m + y + n * M] = BIAS ? (out + BIAS[(m + y) % Bx + (n % By) * M]) : out;
        }
      }
    }
@@ -1684,7 +1683,7 @@ template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
 __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
                               const int Bx, const int By, const fp8_t* B,
                               const fp8_t* __restrict__ A,
-                              const fp8_t* __restrict__ BIAS, scalar_t* C,
+                              const scalar_t* __restrict__ BIAS, scalar_t* C,
                               const float* __restrict__ s_A,
                               const float* __restrict__ s_B, const int _WvPrGrp,
                               const int CuCount) {
@@ -1750,7 +1749,7 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
   auto a_ptr = in_a.data_ptr<fp8_t>();
   auto b_ptr = in_b.data_ptr<fp8_t>();
   auto bias_ptr = (in_bias.has_value() && in_bias->numel() > 0)
-                      ? in_bias->data_ptr<fp8_t>()
+                      ? reinterpret_cast<fptype*>(in_bias->data_ptr())
                       : nullptr;
   switch (N_in) {
     case 1:
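
The new bias indexing, BIAS[(m + y) % Bx + (n % By) * M], serves both bias shapes the Python tests exercise: a 1D bias of length M (By == 1) broadcast across all rows, and a 2D bias of shape (N, M) (By == N) read row by row. Below is a rough host-side Python mirror of that addressing, under the assumption that Bx and By are the bias width and row count as the kernel arguments suggest; this sketch is not part of the commit.

    import torch

    def add_bias_like_kernel(out, bias, M, N, Bx, By):
        # out: (N, M) GEMM result; bias: contiguous buffer of Bx * By elements.
        # Mirrors C[m + y + n * M] += BIAS[(m + y) % Bx + (n % By) * M].
        bias_flat = bias.reshape(-1)
        result = out.clone()
        for n in range(N):
            for m in range(M):
                result[n, m] += bias_flat[m % Bx + (n % By) * M]
        return result

With By == 1 the second term vanishes and every row sees the same length-M bias; with a contiguous (N, M) bias and By == N the index reduces to the usual n * M + m offset.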

tests/kernels/quantization/test_rocm_skinny_gemms.py

Lines changed: 70 additions & 11 deletions
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
+import math
 
 import vllm._custom_ops as ops
 from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant
@@ -47,6 +48,7 @@
     (2, 512, 512),
     (3, 2048, 2048),
     (4, 4096, 4096),
+    (4, 16400, 2048),
     # Extended FP8 dimensions not covered by WVSPLITK
     (1, 14336, 1024),
     (2, 24576, 2048),
@@ -65,6 +67,8 @@
 @torch.inference_mode()
 def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed):
     torch.manual_seed(seed)
+    #TODO: Zero-centering the inputs causes errors for LLMM1!
+    # Without that the numbers quickly saturate, and may be giving false matches.
     A = torch.rand(n, k, dtype=dtype, device="cuda")
     B = torch.rand(m, k, dtype=dtype, device="cuda")
 
@@ -83,8 +87,8 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
     cu_count = current_platform.get_cu_count()
 
-    A = torch.rand(n, k, dtype=dtype, device="cuda")
-    B = torch.rand(m, k, dtype=dtype, device="cuda")
+    A = torch.rand(n, k, dtype=dtype, device="cuda")-.5
+    B = torch.rand(m, k, dtype=dtype, device="cuda")-.5
 
     ref_out = torch.matmul(A, B.t())
     out = ops.wvSplitK(B, A, cu_count)
@@ -101,9 +105,10 @@ def test_rocm_wvsplitk_bias1D_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
     cu_count = current_platform.get_cu_count()
 
-    A = torch.rand(n, k, dtype=dtype, device="cuda")
-    B = torch.rand(m, k, dtype=dtype, device="cuda")
-    BIAS = torch.rand(m, dtype=dtype, device="cuda")
+    xavier = math.sqrt(2/k)  # normalize to avoid large output-bias deltas
+    A = (torch.rand(n, k, dtype=dtype, device="cuda")-.5)*xavier
+    B = (torch.rand(m, k, dtype=dtype, device="cuda")-.5)*xavier
+    BIAS = torch.rand(m, dtype=dtype, device="cuda")-.5
 
     ref_out = torch.matmul(A, B.t()) + BIAS
     out = ops.wvSplitK(B, A, cu_count, BIAS)
@@ -120,16 +125,16 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
     cu_count = current_platform.get_cu_count()
 
-    A = torch.rand(n, k, dtype=dtype, device="cuda")
-    B = torch.rand(m, k, dtype=dtype, device="cuda")
-    BIAS = torch.rand(n, m, dtype=dtype, device="cuda")
+    xavier = math.sqrt(2/k)  # normalize to avoid large output-bias deltas
+    A = (torch.rand(n, k, dtype=dtype, device="cuda")-.5)*xavier
+    B = (torch.rand(m, k, dtype=dtype, device="cuda")-.5)*xavier
+    BIAS = torch.rand(n, m, dtype=dtype, device="cuda")-.5
 
     ref_out = torch.matmul(A, B.t()) + BIAS
     out = ops.wvSplitK(B, A, cu_count, BIAS)
 
     assert torch.allclose(out, ref_out, rtol=0.01)
 
-
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
@@ -139,8 +144,8 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
 def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
     torch.manual_seed(seed)
 
-    A = torch.rand(n, k, device="cuda")
-    B = torch.rand(m, k, device="cuda")
+    A = torch.rand(n, k, device="cuda")-0.5
+    B = torch.rand(m, k, device="cuda")-0.5
 
    A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
    B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
@@ -154,3 +159,57 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
                         current_platform.get_cu_count())
 
     assert torch.allclose(out, ref_out, rtol=0.01)
+
+@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.skipif(
+    not (current_platform.is_rocm() and current_platform.supports_fp8()),
+    reason="only test for rocm fp8")
+def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
+    torch.manual_seed(seed)
+
+    xavier = math.sqrt(2/k)  # normalize to avoid large output-bias deltas
+    A = (torch.rand(n, k, device="cuda")-.5)*xavier
+    B = (torch.rand(m, k, device="cuda")-.5)*xavier
+    BIAS = (torch.rand(m, dtype=dtype, device="cuda")-.5)
+
+    A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
+    B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
+
+    ref_out = torch._scaled_mm(A,
+                               B.t(),
+                               out_dtype=dtype,
+                               scale_a=scale_a,
+                               scale_b=scale_b) + BIAS
+    out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b,
+                        current_platform.get_cu_count(), BIAS)
+
+    assert torch.allclose(out, ref_out, rtol=0.01)
+
+@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.skipif(
+    not (current_platform.is_rocm() and current_platform.supports_fp8()),
+    reason="only test for rocm fp8")
+def test_rocm_wvsplitk_fp8_bias2D_kernel(n, k, m, dtype, seed):
+    torch.manual_seed(seed)
+
+    xavier = math.sqrt(2/k)  # normalize to avoid large output-bias deltas
+    A = (torch.rand(n, k, device="cuda")-.5)*xavier
+    B = (torch.rand(m, k, device="cuda")-.5)*xavier
+    BIAS = torch.rand(n, m, dtype=dtype, device="cuda")-.5
+
+    A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
+    B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
+
+    ref_out = torch._scaled_mm(A,
+                               B.t(),
+                               out_dtype=dtype,
+                               scale_a=scale_a,
+                               scale_b=scale_b) + BIAS
+    out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b,
+                        current_platform.get_cu_count(), BIAS)
+
+    assert torch.allclose(out, ref_out, rtol=0.01)
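
On a ROCm build of vLLM with fp8 support, the two new bias tests can be selected by keyword, e.g. (invocation is illustrative):

    pytest tests/kernels/quantization/test_rocm_skinny_gemms.py -k "fp8_bias" -v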

vllm/model_executor/layers/quantization/utils/w8a8_utils.py

Lines changed: 2 additions & 3 deletions
@@ -178,10 +178,9 @@ def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor,
                                         scale_b: torch.Tensor,
                                         bias: torch.Tensor) -> torch.Tensor:
     from vllm.platforms.rocm import on_mi3xx
-    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx(
-    ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0:
+    if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx() and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0 and bias.dtype == out_dtype :
         output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b,
-                               current_platform.get_cu_count())
+                               current_platform.get_cu_count(), bias)
     else:
         output = torch._scaled_mm(qinput,
                                   weight,
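
Read as a whole, the dispatch above now takes the fused-bias skinny GEMM path only when the bias dtype matches the requested output dtype; otherwise it falls back to torch._scaled_mm. A simplified, self-contained sketch of that decision follows; parameter names are illustrative, module-level lookups are passed in explicitly, and the fallback arguments are an assumption since the diff truncates them. This is not the verbatim vLLM code.

    import torch
    import vllm._custom_ops as ops
    from vllm.platforms import current_platform

    def rocm_scaled_mm_sketch(qinput, weight, out_dtype, scale_a, scale_b, bias,
                              use_skinny_gemm, is_mi3xx):
        # Take the skinny-GEMM path only for single-row inputs whose K dimension
        # is a multiple of 16 and whose bias already has the output dtype.
        if (use_skinny_gemm and is_mi3xx and qinput.shape[0] == 1
                and qinput.shape[1] % 16 == 0 and bias.dtype == out_dtype):
            return ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b,
                                 current_platform.get_cu_count(), bias)
        # Otherwise fall back to PyTorch's scaled matmul.
        return torch._scaled_mm(qinput, weight, scale_a=scale_a, scale_b=scale_b,
                                bias=bias, out_dtype=out_dtype)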
