@@ -628,10 +628,12 @@ def cutlass_scaled_sparse_mm(
     return out
 
 
-def cutlass_fp4_gemm(a: torch.Tensor, b: torch.Tensor, input_sf: torch.Tensor,
-                     weight_sf: torch.dtype, global_sf: torch.dtype,
-                     workspace: torch.dtype, workspace_bytes: int,
-                     out_dtype: torch.dtype) -> torch.Tensor:
+# nvfp4
+def cutlass_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor,
+                          block_scale_a: torch.Tensor,
+                          block_scale_b: torch.Tensor, gscale: torch.Tensor,
+                          workspace: torch.Tensor, workspace_bytes: int,
+                          out_dtype: torch.dtype) -> torch.Tensor:
     """
     Gemm when a and b have the nvfp4 datatype (currently represented as a byte),
     along with their respective block scales and a global scaling factor.
@@ -643,19 +645,80 @@ def cutlass_fp4_gemm(a: torch.Tensor, b: torch.Tensor, input_sf: torch.Tensor,
     n = b.shape[1]
     workspace_bytes = workspace.nbytes
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
-    torch.ops._C.cutlass_fp4_gemm(out, a, b, input_sf, weight_sf, global_sf,
-                                  workspace, workspace_bytes)
+    torch.ops._C.cutlass_scaled_fp4_mm(out, a, b, block_scale_a, block_scale_b,
+                                       gscale, workspace, workspace_bytes)
     return out
 
 
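For orientation, here is a hedged call sketch of the wrapper above. Every shape, size, and block-scale layout in it is an assumption chosen only to satisfy the wrapper's own arithmetic (`n = b.shape[1]`, `workspace.nbytes`), not a layout this change mandates:

# Sketch only: all shapes, the workspace size, and the block-scale
# layouts below are illustrative assumptions.
m, k, n = 128, 64, 256
a = torch.zeros((m, k // 2), dtype=torch.uint8, device='cuda')  # packed fp4
b = torch.zeros((k // 2, n), dtype=torch.uint8, device='cuda')  # packed fp4
bs_a = torch.zeros((128, 1), dtype=torch.int32, device='cuda')  # assumed shape
bs_b = torch.zeros((256, 1), dtype=torch.int32, device='cuda')  # assumed shape
gscale = torch.tensor(1.0, device='cuda')
workspace = torch.empty(32 << 20, dtype=torch.uint8, device='cuda')
out = cutlass_scaled_fp4_mm(a, b, bs_a, bs_b, gscale,
                            workspace, workspace.nbytes, torch.bfloat16)
assert out.shape == (m, n) and out.dtype == torch.bfloat16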
-def quantize_to_fp4(input: torch.Tensor, input_sf: torch.Tensor,
-                    output_sf: torch.Tensor) -> torch.Tensor:
-    assert (input is torch.bfloat16 or input is torch.float16)
-    m = input.shape[0]
-    n = input.shape[1]
-    output = torch.empty((m, n // 2), dtype=torch.uint8, device=input.device)
-    torch.ops._C.quantize_fp4(output, input, input_sf, output_sf, False)
-    return output, output_sf
+def pad_up_fn(x, y):
+    """Pads x up to the nearest multiple of y."""
+    return ((x + y - 1) // y) * y
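A quick sanity check of the rounding arithmetic (inputs chosen purely for illustration):

assert pad_up_fn(130, 128) == 256  # rounds up to the next multiple of 128
assert pad_up_fn(4, 4) == 4        # already-aligned values are unchanged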
+
+
+def scaled_fp4_quant(
+        input: torch.Tensor,
+        global_scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantizes a BF16/FP16 input to NVFP4 precision. Returns the quantized
+    fp4 tensor (two values packed per uint8) and its corresponding block
+    scale, stored in a swizzled and possibly padded layout.
+    """
+    assert input.ndim >= 1, (
+        f'input.ndim needs to be >= 1, but got {input.ndim}.')
+    other_dims = 1 if input.ndim == 1 else -1
+    input = input.reshape(other_dims, input.shape[-1])
+    m, n = input.shape
+    block_size = 16
+    device = input.device
+
+    assert n % block_size == 0, (
+        f'last dim has to be multiple of 16, but got {n}.')
+    assert input.dtype in (torch.float16, torch.bfloat16), (
+        f'input.dtype needs to be fp16 or bf16 but got {input.dtype}.')
+
+    # Two fp4 values are packed into each uint8.
+    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    # The block scales are stored in a swizzled layout: the row dimension is
+    # padded up to a multiple of 128, and four float8_e4m3fn scaling factors
+    # are packed into each int32.
+    rounded_m = pad_up_fn(m, 128)
+    scale_n = n // block_size
+    rounded_n = pad_up_fn(scale_n, 4)
+    block_scale_out = torch.empty((rounded_m, rounded_n // 4),
+                                  device=device,
+                                  dtype=torch.int32)
+    torch.ops._C.scaled_fp4_quant(output, input, block_scale_out, global_scale,
+                                  False)
+    return output, block_scale_out
+
+
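A hedged usage sketch for the quantizer. The global-scale convention shown, the e4m3 maximum (448) times the e2m1 maximum (6) over the tensor's absolute maximum, is a common nvfp4 recipe and an assumption here, not something this diff prescribes:

x = torch.randn(128, 64, dtype=torch.float16, device='cuda')
global_scale = (448 * 6) / x.abs().float().max()  # assumed convention
x_fp4, x_block_scale = scaled_fp4_quant(x, global_scale)
assert x_fp4.shape == (128, 32)          # two e2m1 values per uint8
assert x_block_scale.shape == (128, 1)   # rows padded up to 128; four fp8
                                         # scales packed into each int32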
+def blockscale_interleave(input: torch.Tensor) -> torch.Tensor:
+    """
+    Takes the block scale `input` and returns an interleaved version of it.
+    The output `interleaved_block_scale` may be padded.
+    """
+    block_scale_shape = input.size()
+
+    # The block scale must be 2D (single expert) or 3D (batched by expert).
+    if len(block_scale_shape) not in (2, 3):
+        raise ValueError("Block scale should be a 2D or 3D tensor.")
+
+    # Extract dimensions based on whether the tensor is 2D or 3D.
+    num_experts = block_scale_shape[0] if len(block_scale_shape) == 3 else 1
+    rows = block_scale_shape[-2]
+    cols = block_scale_shape[-1]
+
+    expert_out_size = pad_up_fn(rows, 128) * pad_up_fn(cols, 4)
+    interleaved_block_scale = torch.zeros(expert_out_size * num_experts,
+                                          dtype=torch.int8,
+                                          device=input.device)
+
+    torch.ops._C.blockscale_interleave(interleaved_block_scale, input, rows,
+                                       cols, num_experts, expert_out_size)
+    return interleaved_block_scale
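And a hedged usage sketch for the interleaver, for a single 2D expert. The input dtype (float8_e4m3fn viewed as int8) and the row-major layout are assumptions, not requirements stated by this diff:

rows, cols = 256, 4  # e.g. block scales for a (256, 64) weight, 16-wide blocks
bs = torch.zeros((rows, cols), dtype=torch.float8_e4m3fn, device='cuda')
bs_interleaved = blockscale_interleave(bs.view(torch.int8))
# Flat, padded output: pad_up_fn(256, 128) * pad_up_fn(4, 4) == 256 * 4 == 1024
assert bs_interleaved.numel() == 1024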
 
 
 # aqlm