@@ -198,26 +198,27 @@ __global__ void moe_align_block_size_global_mem_kernel(
198198}
199199
200200// taken from
201- // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
201+ // https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
202202template <typename scalar_t >
203203__global__ void sgl_moe_align_block_size_kernel (
204204 scalar_t * __restrict__ topk_ids, int32_t * sorted_token_ids,
205205 int32_t * expert_ids, int32_t * total_tokens_post_pad, int32_t num_experts,
206206 int32_t block_size, size_t numel, int32_t * cumsum) {
207207 __shared__ int32_t shared_counts[32 ][8 ];
208- __shared__ int32_t local_offsets[256 ];
209208
210209 const int warp_id = threadIdx .x / 32 ;
211- const int lane_id = threadIdx .x % 32 ;
212210 const int experts_per_warp = 8 ;
213211 const int my_expert_start = warp_id * experts_per_warp;
214212
213+ // Initialize shared_counts for this warp's experts
215214 for (int i = 0 ; i < experts_per_warp; ++i) {
216215 if (my_expert_start + i < num_experts) {
217216 shared_counts[warp_id][i] = 0 ;
218217 }
219218 }
220219
220+ __syncthreads ();
221+
221222 const size_t tokens_per_thread = CEILDIV (numel, blockDim .x );
222223 const size_t start_idx = threadIdx .x * tokens_per_thread;
223224
@@ -230,6 +231,7 @@ __global__ void sgl_moe_align_block_size_kernel(
230231
231232 __syncthreads ();
232233
234+ // Single thread computes cumulative sum and total tokens
233235 if (threadIdx .x == 0 ) {
234236 cumsum[0 ] = 0 ;
235237 for (int i = 1 ; i <= num_experts; ++i) {
@@ -246,19 +248,28 @@ __global__ void sgl_moe_align_block_size_kernel(
246248
247249 __syncthreads ();
248250
251+ // Assign expert IDs to blocks
249252 if (threadIdx .x < num_experts) {
250253 for (int i = cumsum[threadIdx .x ]; i < cumsum[threadIdx .x + 1 ];
251254 i += block_size) {
252255 expert_ids[i / block_size] = threadIdx .x ;
253256 }
254- local_offsets[threadIdx .x ] = cumsum[threadIdx .x ];
255257 }
258+ }
256259
257- __syncthreads ();
258-
259- for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
//
// Scatters token indices into `sorted_token_ids`, grouped by expert.
// On entry, `cumsum_buffer[e]` holds the starting write offset of expert
// `e`'s segment; each placement atomically advances that offset, so every
// token claims a unique slot inside its expert's segment. Launch with any
// 1-D grid — the grid-stride loop covers all `numel` entries regardless of
// grid size.
template <typename scalar_t>
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
                                          int32_t* sorted_token_ids,
                                          int32_t* cumsum_buffer,
                                          size_t numel) {
  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t step = blockDim.x * gridDim.x;

  for (size_t token = first; token < numel; token += step) {
    const int32_t expert = topk_ids[token];
    // Claim the next free slot in this expert's output segment.
    const int32_t slot = atomicAdd(&cumsum_buffer[expert], 1);
    sorted_token_ids[slot] = token;
  }
}
// Aligns and sorts MoE token assignments into block_size-padded per-expert
// segments (two-phase: a single-block align kernel computes padded cumulative
// offsets and the block->expert map, then a grid-stride sort kernel scatters
// token indices). Results are written into sorted_token_ids, experts_ids and
// num_tokens_post_pad; all kernels run on the current CUDA stream.
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                              int64_t block_size, torch::Tensor sorted_token_ids,
                              torch::Tensor experts_ids,
                              torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // The align kernel hard-codes a 32x8 shared-counts layout (256 experts),
  // i.e. the DeepSeek-V3 configuration.
  TORCH_CHECK(num_experts == 256,
              "sgl_moe_align_block_size kernel only supports deepseek v3.");

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        // Must be zero-initialized (not torch::empty): the align kernel
        // accumulates prefix sums into it and the sort kernel then atomically
        // increments the per-expert offsets, so stale garbage would corrupt
        // the output placement.
        torch::Tensor cumsum_buffer =
            torch::zeros({num_experts + 1}, options_int);

        // Phase 1: one block computes per-expert counts, the block_size-padded
        // cumulative offsets, and the block -> expert mapping.
        auto align_kernel =
            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
        align_kernel<<<1, 1024, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
            experts_ids.data_ptr<int32_t>(),
            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
            topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());

        // Phase 2: scatter token indices into their experts' segments.
        // Compute the grid size in 64-bit (numel() is int64_t) before clamping
        // to the 65535-block launch bound, and never launch fewer than one
        // block — a zero-sized grid is a CUDA launch error, and the kernel's
        // grid-stride loop is already a no-op when numel == 0.
        const int block_threads = 256;
        const int64_t needed_blocks =
            (topk_ids.numel() + block_threads - 1) / block_threads;
        const int64_t max_blocks = 65535;
        const int actual_blocks = static_cast<int>(
            std::min(std::max(needed_blocks, int64_t{1}), max_blocks));
        auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
      });
}
399421
0 commit comments