
Commit ebb554c

Support nvfp4 quantization
Signed-off-by: kaixih <kaixih@nvidia.com>
1 parent: 18016a5

9 files changed: +736 −5 lines changed
CMakeLists.txt
Lines changed: 18 additions & 0 deletions

@@ -264,6 +264,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_compressor_entry.cu"
     "csrc/cutlass_extensions/common.cpp")

@@ -377,6 +378,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
 
+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND FP4_ARCHS)
+    set(SRCS
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()
 
 #
 # Machete kernels

cmake/utils.cmake
Lines changed: 13 additions & 5 deletions

@@ -257,9 +257,9 @@ endmacro()
 # where `<=` is the version comparison operator.
 # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
 # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
-# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
-# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
-# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS).
+# We have special handling for x.0a: if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
+# in `TGT_CUDA_ARCHS`, then we remove x.0a from `SRC_CUDA_ARCHS` and add
+# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:

@@ -272,8 +272,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
   set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
 
-  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
-  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
+  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
     list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")

@@ -283,6 +283,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
     endif()
   endif()
 
+  if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
+    if ("10.0" IN_LIST TGT_CUDA_ARCHS)
+      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
+      set(_CUDA_ARCHS "10.0a")
+    endif()
+  endif()
+
   list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
 
   # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
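A concrete consequence of this hunk: the cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") call in the CMakeLists.txt change above now resolves FP4_ARCHS to "10.0a" precisely when "10.0" is among the requested CUDA_ARCHS, the same promotion the function already performed for the 9.0/9.0a (Hopper) pair.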

csrc/ops.h
Lines changed: 4 additions & 0 deletions

@@ -195,6 +195,10 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
 
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 
+void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
+                      torch::Tensor& output_scale,
+                      torch::Tensor const& input_scale);
+
 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& scale);
 
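For orientation, here is a minimal sketch of how a caller might drive this op. It is not part of the commit, and the buffer shapes and dtypes are assumptions based on the usual NVFP4 convention (two FP4 values packed per uint8 byte, one FP8 E4M3 scale per 16-element block, one FP32 global input scale); quantize_example is a hypothetical name. The authoritative contract is defined by the kernel file this commit adds (nvfp4_quant_kernels.cu).

#include <torch/all.h>
#include "ops.h"

// Hypothetical caller (not from the commit). Shapes/dtypes are assumptions:
// output packs two FP4 values per byte, output_scale holds one E4M3 scale
// per 16-element block, input_scale is a single FP32 global scale.
void quantize_example(torch::Tensor const& input,        // [m, n], fp16/bf16
                      torch::Tensor const& input_scale)  // [1], fp32
{
  int64_t m = input.size(0);
  int64_t n = input.size(1);
  auto opts = torch::TensorOptions().device(input.device());

  // Two FP4 values per output byte, so the last dimension halves.
  torch::Tensor output =
      torch::empty({m, n / 2}, opts.dtype(torch::kUInt8));
  // One E4M3 scale factor per 16-element block along the last dimension.
  torch::Tensor output_scale =
      torch::empty({m, n / 16}, opts.dtype(at::ScalarType::Float8_e4m3fn));

  scaled_fp4_quant(output, input, output_scale, input_scale);
}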

csrc/quantization/fp4/cudaUtils.h (new file)
Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "cutlass/cutlass.h"
#include <climits>

namespace vllm {
namespace common {

class CudaException : public std::runtime_error {
 public:
  CudaException(const std::string& file, int line, const std::string& message)
      : std::runtime_error("CUDA Error at " + file + ":" +
                           std::to_string(line) + " - " + message) {}
};

template <typename T>
void check(T result, const char* func, const char* file, int line) {
  if (result != cudaSuccess) {
    throw CudaException(
        file, line,
        std::string("[VLLM][ERROR] CUDA runtime error in ") + func + ": " +
            cudaGetErrorString(static_cast<cudaError_t>(result)));
  }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

inline int getMaxSharedMemoryPerBlockOptin() {
  int device_id;
  int max_shared_memory_per_block;
  check_cuda_error(cudaGetDevice(&device_id));
  check_cuda_error(cudaDeviceGetAttribute(
      &max_shared_memory_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
      device_id));
  return max_shared_memory_per_block;
}

inline int getSMVersion() {
  int device{-1};
  check_cuda_error(cudaGetDevice(&device));
  int sm_major = 0;
  int sm_minor = 0;
  check_cuda_error(cudaDeviceGetAttribute(
      &sm_major, cudaDevAttrComputeCapabilityMajor, device));
  check_cuda_error(cudaDeviceGetAttribute(
      &sm_minor, cudaDevAttrComputeCapabilityMinor, device));
  return sm_major * 10 + sm_minor;
}

}  // namespace common
}  // namespace vllm
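A brief usage sketch of the helpers above (not part of the commit; the include path depends on the build setup): check_cuda_error stringifies the wrapped call via #val, so a failing CUDA runtime call throws a CudaException reporting file, line, and the offending expression, and getSMVersion() folds major/minor compute capability into one integer.

#include <cuda_runtime.h>
#include "cudaUtils.h"

void example() {
  using namespace vllm::common;

  void* buf = nullptr;
  // Throws CudaException on failure instead of silently returning an error.
  check_cuda_error(cudaMalloc(&buf, 1024));

  // getSMVersion() returns major * 10 + minor, e.g. 100 for SM 10.0.
  if (getSMVersion() >= 100) {
    // Blackwell-class device: the NVFP4 path is usable here.
  }

  check_cuda_error(cudaFree(buf));
}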
csrc/quantization/fp4/nvfp4_quant_entry.cu (new file)
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <torch/all.h>

#if defined ENABLE_NVFP4 && ENABLE_NVFP4
void scaled_fp4_quant_sm100a(torch::Tensor& output, torch::Tensor const& input,
                             torch::Tensor& output_sf,
                             torch::Tensor const& input_sf);
#endif

void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
#endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
}
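The entry file uses a two-level guard: the wrapper is always compiled, while the SM100a implementation is only declared and dispatched when the build defined ENABLE_NVFP4 (see the CMakeLists.txt hunk above), so unsupported builds fail with a clean NotImplementedError rather than a link error. A hypothetical pre-flight helper, sketched below under the assumption that the kernels target compute capability 10.0a only, could let callers route around the TORCH_CHECK_NOT_IMPLEMENTED at runtime:

#include "cudaUtils.h"

// Hypothetical helper (not part of the commit): probe for NVFP4 support
// before calling scaled_fp4_quant, instead of catching its error.
bool nvfp4_supported() {
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
  // Kernels are compiled for 10.0a; getSMVersion() yields 100 on SM 10.0.
  return vllm::common::getSMVersion() == 100;
#else
  return false;
#endif
}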
