8 changes: 4 additions & 4 deletions .buildkite/release-pipeline.yaml
@@ -48,13 +48,13 @@ steps:
DOCKER_BUILDKIT: "1"

# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-12-9
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -103,7 +103,7 @@ steps:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-12-9
- build-wheel-cuda-13-0
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
28 changes: 25 additions & 3 deletions csrc/layernorm_kernels.cu
@@ -6,8 +6,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
#endif
#endif

namespace vllm {
@@ -30,7 +37,12 @@ __global__ void rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -85,7 +97,12 @@ fused_add_rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -126,7 +143,12 @@ fused_add_rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
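Note: every CUDA file in this diff applies the same fix. CUB 3.0 (the CCCL release that ships with CUDA 13) drops the cub::Sum and cub::Max functors, so each file aliases its reduction operator behind a CUB_VERSION guard and passes the alias to BlockReduce. The sketch below shows the sum-reduction variant in isolation, assuming CUB_VERSION >= 300000 identifies CUB 3.0+; the kernel name and block size are illustrative, not taken from the diff.

#include <cub/cub.cuh>

#if CUB_VERSION >= 300000
  // CUB 3.0+ removed cub::Sum; use the libcu++ functor instead.
  #include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#else
  // Older CUB still provides the classic functor.
using AddOp = cub::Sum;
#endif

// Illustrative kernel: block-wide sum of squares, the same reduction an
// RMS-norm kernel performs before computing rsqrtf(mean + epsilon).
__global__ void sum_of_squares_kernel(const float* __restrict__ in,
                                      float* __restrict__ out, int n) {
  float partial = 0.0f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    partial += in[i] * in[i];
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduce_store;
  // AddOp{} resolves to cuda::std::plus<> on CUB >= 3.0 and to cub::Sum before it.
  partial = BlockReduce(reduce_store).Reduce(partial, AddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    *out = partial;
  }
}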
28 changes: 25 additions & 3 deletions csrc/layernorm_quant_kernels.cu
@@ -14,8 +14,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
#endif
#endif

namespace vllm {
@@ -39,7 +46,12 @@ __global__ void rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -100,7 +112,12 @@ fused_add_rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -149,7 +166,12 @@ fused_add_rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
18 changes: 15 additions & 3 deletions csrc/moe/topk_softmax_kernels.cu
@@ -25,11 +25,19 @@
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
using AddOp = cuda::std::plus<float>;
#if CUB_VERSION >= 300000
using AddOp = cuda::std::plus<>;
using MaxOp = cuda::maximum<>;
#else
using AddOp = cuda::std::plus<float>;
#endif
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
using AddOp = cub::Sum;
using AddOp = cub::Sum;
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -79,7 +87,11 @@ __launch_bounds__(TPB) __global__
threadData = max(static_cast<float>(input[idx]), threadData);
}

const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
#if CUB_VERSION >= 300000
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, MaxOp());
#else
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
#endif
if (threadIdx.x == 0)
{
float_max = maxElem;
14 changes: 13 additions & 1 deletion csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -13,9 +13,16 @@
#ifndef USE_ROCM
#include <cub/cub.cuh>
#include <cub/util_type.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp>
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

static inline __device__ int8_t float_to_int8_rn(float x) {
@@ -173,7 +180,12 @@ __global__ void dynamic_scaled_int8_quant_kernel(
});
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);

#if CUB_VERSION >= 300000
float block_max = BlockReduce(tmp).Reduce(thread_max, MaxOp{}, blockDim.x);
#else
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
#endif
__shared__ float absmax;
if (tid == 0) {
absmax = block_max;
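The max-reduction call sites (top-k softmax, int8 and fp8 dynamic scales, fused layernorm-quant) get the same treatment, with cuda::maximum<> replacing cub::Max on CUB 3.0+. A hedged sketch of that variant, mirroring the dynamic int8 absmax reduction above (the kernel name and 256-thread block size are illustrative):

#include <cub/cub.cuh>

#if CUB_VERSION >= 300000
  // CUB 3.0+ removed cub::Max; the diff pulls cuda::maximum<> from libcu++.
  #include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#else
using MaxOp = cub::Max;
#endif

// Illustrative kernel: per-block absolute maximum, the reduction used to
// derive a dynamic quantization scale.
__global__ void block_absmax_kernel(const float* __restrict__ in,
                                    float* __restrict__ out, int n) {
  float thread_max = 0.0f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    thread_max = fmaxf(thread_max, fabsf(in[i]));
  }

  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  // MaxOp{} resolves to cuda::maximum<> on CUB >= 3.0 and to cub::Max before it.
  float block_max = BlockReduce(tmp).Reduce(thread_max, MaxOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    *out = block_max;
  }
}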
15 changes: 14 additions & 1 deletion csrc/quantization/fp8/common.cu
@@ -6,8 +6,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

namespace vllm {
@@ -115,8 +122,14 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(

using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
const float block_max =

#if CUB_VERSION >= 300000
const float block_max =
BlockReduce(tmp).Reduce(absmax_val, MaxOp{}, blockDim.x);
#else
const float block_max =
BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
#endif

__shared__ float token_scale;
if (tid == 0) {
48 changes: 40 additions & 8 deletions csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -10,8 +10,17 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
using MaxOp = cub::Max;
#endif
#endif

namespace vllm {
@@ -36,7 +45,12 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
ss = BlockReduce(reduceStore).Reduce(ss, AddOp{}, blockDim.x);
#else
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
#endif

__shared__ float s_rms;
if (threadIdx.x == 0) {
@@ -71,9 +85,15 @@ __device__ void compute_dynamic_per_token_scales(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#if CUB_VERSION >= 300000
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, MaxOp{}, blockDim.x);
#else
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#endif

__shared__ float s_token_scale;
if (threadIdx.x == 0) {
@@ -169,7 +189,12 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
ss = BlockReduce(reduceStore).Reduce(ss, AddOp{}, blockDim.x);
#else
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
#endif

__shared__ float s_rms;
if (threadIdx.x == 0) {
@@ -238,9 +263,16 @@ __device__ void compute_dynamic_per_token_scales(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);

#if CUB_VERSION >= 300000
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, MaxOp{}, blockDim.x);
#else
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#endif

__shared__ float s_token_scale;
if (threadIdx.x == 0) {