from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType
+from vllm.utils import round_up

logger = init_logger(__name__)

@@ -785,7 +786,7 @@ def scaled_fp4_quant(
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every
        two values are packed into a uint8 and float8_e4m3 scaling factors
-        in a sizzled layout.
+        in the swizzled layout.
    """
    assert input.ndim >= 1, (
        f'input.ndim needs to be >= 1, but got {input.ndim}.')
@@ -803,11 +804,14 @@ def scaled_fp4_quant(
    # Two fp4 values will be packed into an uint8.
    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)

-    # We use the rounded values to store the swizzled values. Then, the scaling
-    # factors in float8_e4m3fn are packed into an int32 for every 4 values.
-    rounded_m = ((m + 128 - 1) // 128) * 128
+    # We use the rounded values to store the swizzled values. Due to the
+    # Tensor Core requirement, the minimum tile for the scales is 128x4, so we
+    # first pad the scales to multiples of 128 and 4. Then, the scales
+    # (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
+    rounded_m = round_up(m, 128)
    scale_n = n // block_size
-    rounded_n = ((scale_n + 4 - 1) // 4) * 4
+    rounded_n = round_up(scale_n, 4)
    output_scale = torch.empty((rounded_m, rounded_n // 4),
                               device=device,
                               dtype=torch.int32)
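
For reference, here is a minimal sketch of the shape arithmetic this hunk introduces, under stated assumptions: `m`, `n`, and `block_size = 16` (the typical NVFP4 group size) are example values chosen for illustration, and `round_up` is re-implemented locally with the semantics assumed for `vllm.utils.round_up`.

```python
# Sketch only, not the vLLM kernel: shape math for the padded, swizzled
# scale tensor allocated above. All concrete values here are assumptions.
def round_up(x: int, y: int) -> int:
    # Assumed semantics of vllm.utils.round_up: smallest multiple of y >= x.
    return ((x + y - 1) // y) * y

m, n, block_size = 100, 256, 16     # example activation shape (assumed)
scale_n = n // block_size           # one e4m3 scale per 16-value fp4 group -> 16
rounded_m = round_up(m, 128)        # pad rows to the 128-row tile          -> 128
rounded_n = round_up(scale_n, 4)    # pad scale columns to a multiple of 4  -> 16
print((rounded_m, rounded_n // 4))  # (128, 4): four e4m3 scales per int32
```

So, under these assumptions, a 100x256 input yields a 128x4 int32 scale tensor, matching the padded `output_scale` allocation in the diff.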