8 changes: 4 additions & 4 deletions .buildkite/release-pipeline.yaml
@@ -48,13 +48,13 @@ steps:
DOCKER_BUILDKIT: "1"

# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-12-9
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -103,7 +103,7 @@ steps:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-12-9
- build-wheel-cuda-13-0
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
28 changes: 25 additions & 3 deletions csrc/layernorm_kernels.cu
@@ -6,8 +6,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
#endif
#endif

namespace vllm {
@@ -30,7 +37,12 @@ __global__ void rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -85,7 +97,12 @@ fused_add_rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -126,7 +143,12 @@ fused_add_rms_norm_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
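Note: every CUDA file in this diff applies the same fix. CUB 3.0 (the CCCL release that ships with CUDA 13) drops the cub::Sum and cub::Max functors, so each file aliases its reduction operator behind a CUB_VERSION guard and passes the alias to BlockReduce. The sketch below shows the sum-reduction variant in isolation, assuming CUB_VERSION >= 300000 identifies CUB 3.0+; the kernel name and block size are illustrative, not taken from the diff.

#include <cub/cub.cuh>

#if CUB_VERSION >= 300000
  // CUB 3.0+ removed cub::Sum; use the libcu++ functor instead.
  #include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#else
  // Older CUB still provides the classic functor.
using AddOp = cub::Sum;
#endif

// Illustrative kernel: block-wide sum of squares, the same reduction an
// RMS-norm kernel performs before computing rsqrtf(mean + epsilon).
__global__ void sum_of_squares_kernel(const float* __restrict__ in,
                                      float* __restrict__ out, int n) {
  float partial = 0.0f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    partial += in[i] * in[i];
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduce_store;
  // AddOp{} resolves to cuda::std::plus<> on CUB >= 3.0 and to cub::Sum before it.
  partial = BlockReduce(reduce_store).Reduce(partial, AddOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    *out = partial;
  }
}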
28 changes: 25 additions & 3 deletions csrc/layernorm_quant_kernels.cu
@@ -14,8 +14,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
#endif
#endif

namespace vllm {
@@ -39,7 +46,12 @@ __global__ void rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -100,7 +112,12 @@ fused_add_rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -149,7 +166,12 @@ fused_add_rms_norm_static_fp8_quant_kernel(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
variance = BlockReduce(reduceStore).Reduce(variance, AddOp{}, blockDim.x);
#else
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
#endif

if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
18 changes: 15 additions & 3 deletions csrc/moe/topk_softmax_kernels.cu
@@ -25,11 +25,19 @@
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
using AddOp = cuda::std::plus<float>;
#if CUB_VERSION >= 300000
using AddOp = cuda::std::plus<>;
using MaxOp = cuda::maximum<>;
#else
using AddOp = cuda::std::plus<float>;
#endif
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
using AddOp = cub::Sum;
using AddOp = cub::Sum;
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -79,7 +87,11 @@ __launch_bounds__(TPB) __global__
threadData = max(static_cast<float>(input[idx]), threadData);
}

const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
#if CUB_VERSION >= 300000
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, MaxOp());
#else
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
#endif
if (threadIdx.x == 0)
{
float_max = maxElem;
14 changes: 13 additions & 1 deletion csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -13,9 +13,16 @@
#ifndef USE_ROCM
#include <cub/cub.cuh>
#include <cub/util_type.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp>
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

static inline __device__ int8_t float_to_int8_rn(float x) {
@@ -173,7 +180,12 @@ __global__ void dynamic_scaled_int8_quant_kernel(
});
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);

#if CUB_VERSION >= 300000
float block_max = BlockReduce(tmp).Reduce(thread_max, MaxOp{}, blockDim.x);
#else
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
#endif
__shared__ float absmax;
if (tid == 0) {
absmax = block_max;
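The max-reduction call sites (top-k softmax, int8 and fp8 dynamic scales, fused layernorm-quant) get the same treatment, with cuda::maximum<> replacing cub::Max on CUB 3.0+. A hedged sketch of that variant, mirroring the dynamic int8 absmax reduction above (the kernel name and 256-thread block size are illustrative):

#include <cub/cub.cuh>

#if CUB_VERSION >= 300000
  // CUB 3.0+ removed cub::Max; the diff pulls cuda::maximum<> from libcu++.
  #include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#else
using MaxOp = cub::Max;
#endif

// Illustrative kernel: per-block absolute maximum, the reduction used to
// derive a dynamic quantization scale.
__global__ void block_absmax_kernel(const float* __restrict__ in,
                                    float* __restrict__ out, int n) {
  float thread_max = 0.0f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    thread_max = fmaxf(thread_max, fabsf(in[i]));
  }

  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  // MaxOp{} resolves to cuda::maximum<> on CUB >= 3.0 and to cub::Max before it.
  float block_max = BlockReduce(tmp).Reduce(thread_max, MaxOp{}, blockDim.x);

  if (threadIdx.x == 0) {
    *out = block_max;
  }
}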
15 changes: 14 additions & 1 deletion csrc/quantization/fp8/common.cu
@@ -6,8 +6,15 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using MaxOp = cub::Max;
#endif
#endif

namespace vllm {
@@ -115,8 +122,14 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(

using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
const float block_max =

#if CUB_VERSION >= 300000
const float block_max =
BlockReduce(tmp).Reduce(absmax_val, MaxOp{}, blockDim.x);
#else
const float block_max =
BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
#endif

__shared__ float token_scale;
if (tid == 0) {
48 changes: 40 additions & 8 deletions csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -10,8 +10,17 @@

#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 300000
#include <cuda/std/functional>
using AddOp = cuda::std::plus<>;
using MaxOp = cuda::maximum<>;
#endif
#else
#include <hipcub/hipcub.hpp>
#if CUB_VERSION >= 300000
using AddOp = cub::Sum;
using MaxOp = cub::Max;
#endif
#endif

namespace vllm {
@@ -36,7 +45,12 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
ss = BlockReduce(reduceStore).Reduce(ss, AddOp{}, blockDim.x);
#else
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
#endif

__shared__ float s_rms;
if (threadIdx.x == 0) {
@@ -71,9 +85,15 @@ __device__ void compute_dynamic_per_token_scales(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#if CUB_VERSION >= 300000
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, MaxOp{}, blockDim.x);
#else
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#endif

__shared__ float s_token_scale;
if (threadIdx.x == 0) {
@@ -169,7 +189,12 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);

#if CUB_VERSION >= 300000
ss = BlockReduce(reduceStore).Reduce(ss, AddOp{}, blockDim.x);
#else
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
#endif

__shared__ float s_rms;
if (threadIdx.x == 0) {
@@ -238,9 +263,16 @@ __device__ void compute_dynamic_per_token_scales(

using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);

#if CUB_VERSION >= 300000
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, MaxOp{}, blockDim.x);
#else
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
#endif

__shared__ float s_token_scale;
if (threadIdx.x == 0) {