
Commit d89a660

merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-forward-structured-outputs
2 parents: 1262acc + f25e0d1

174 files changed: +6821 −1850 lines

.buildkite/scripts/upload-wheels.sh

Lines changed: 1 addition & 0 deletions

@@ -75,3 +75,4 @@ else
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions

@@ -309,6 +309,7 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min

csrc/activation_kernels.cu

Lines changed: 3 additions & 0 deletions

@@ -70,6 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1);              \
   dim3 grid(num_tokens);                                            \
   dim3 block(std::min(d, 1024));                                    \
+  if (num_tokens == 0) {                                            \
+    return;                                                         \
+  }                                                                 \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();     \
   VLLM_DISPATCH_FLOATING_TYPES(                                     \
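
A minimal sketch of why this guard matters (names illustrative, not vLLM's): CUDA rejects launches with a zero-sized grid, so a wrapper that sizes its grid as one block per token must return early when the input is empty:

#include <cstdint>
#include <algorithm>
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, int d) {
  // One block per token; threads stride across the hidden dimension d.
  float* row = data + static_cast<int64_t>(blockIdx.x) * d;
  for (int i = threadIdx.x; i < d; i += blockDim.x) row[i] *= 2.0f;
}

void launch_scale(float* data, int num_tokens, int d) {
  // Mirrors the hunk above: dim3 grid(0) is an invalid launch configuration.
  if (num_tokens == 0) return;
  dim3 grid(num_tokens);
  dim3 block(std::min(d, 1024));
  scale_kernel<<<grid, block>>>(data, d);
}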

csrc/dispatch_utils.h

Lines changed: 14 additions & 0 deletions

@@ -65,5 +65,19 @@
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
 
+#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
+
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
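
For context, a minimal sketch of the switch-and-rebind pattern these macros expand to, with a hand-rolled enum standing in for at::ScalarType (everything here is illustrative):

#include <cstdint>
#include <stdexcept>

enum class ScalarType { Int, Long, UInt32 };

template <typename T> struct TypeTag { using type = T; };

// Each case hands the callback a tag carrying the concrete element type, so
// one call site can launch a templated kernel for every supported dtype,
// signed or unsigned.
template <typename Fn>
void dispatch_integral_and_unsigned(ScalarType t, Fn&& fn) {
  switch (t) {
    case ScalarType::Int:    fn(TypeTag<int32_t>{});  break;
    case ScalarType::Long:   fn(TypeTag<int64_t>{});  break;
    case ScalarType::UInt32: fn(TypeTag<uint32_t>{}); break;
    default: throw std::runtime_error("unsupported dtype");
  }
}

// Usage: the lambda recovers scalar_t from the tag, as the macros do.
// dispatch_integral_and_unsigned(t, [&](auto tag) {
//   using scalar_t = typename decltype(tag)::type;
//   run_kernel<scalar_t>(/* ... */);
// });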

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 7 additions & 7 deletions

@@ -473,15 +473,15 @@ __global__ void Marlin(
     if (mul_topk_weights) {
 #pragma unroll
       for (int i = 0; i < 4; i++) {
+        int idx = tid4 * 4 + i;
+        idx = idx < block_num_valid_tokens ? idx : 0;
         if constexpr (w_type == vllm::kFE2M1f) {
-          sh_block_topk_weights[tid4 * 4 + i] = __hmul2(
-              global_scale,
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]])));
+          sh_block_topk_weights[idx] = __hmul2(
+              global_scale, Dtype::num2num2(Dtype::float2num(
+                                topk_weights_ptr[sh_block_sorted_ids[idx]])));
         } else {
-          sh_block_topk_weights[tid4 * 4 + i] =
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]]));
+          sh_block_topk_weights[idx] = Dtype::num2num2(
+              Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
         }
       }
     }
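
A minimal sketch of the bounds guard this hunk introduces, with illustrative names: lanes whose index falls past the number of valid tokens are redirected to slot 0, so every thread issues an in-bounds load and no divergent branch is needed:

__device__ void gather_topk_weights(float* dst, const float* weights,
                                    const int* sorted_ids, int tid4,
                                    int num_valid) {
#pragma unroll
  for (int i = 0; i < 4; i++) {
    int idx = tid4 * 4 + i;
    // Clamp instead of skipping the load: out-of-range lanes all rewrite
    // slot 0 with the same value, which is benign, and never touch memory
    // past the end of the table.
    idx = idx < num_valid ? idx : 0;
    dst[idx] = weights[sorted_ids[idx]];
  }
}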

csrc/moe/moe_align_sum_kernels.cu

Lines changed: 4 additions & 4 deletions

@@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   }
 
   if (use_global_memory) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
@@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
             cumsum_buffer.data_ptr<int32_t>());
       });
   } else if (use_i16) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
           auto kernel =
@@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               topk_ids.numel());
       });
   } else {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
          auto kernel =
              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
@@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   TORCH_CHECK(num_experts == 256,
               "sgl_moe_align_block_size kernel only supports deepseek v3.");
 
-  VLLM_DISPATCH_INTEGRAL_TYPES(
+  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =

csrc/moe/topk_softmax_kernels.cu

Lines changed: 45 additions & 18 deletions

@@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__
     }
 }
 
-template <int TPB>
-__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
-    int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
+template <int TPB, typename IndType>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    IndType* indices,
+    int* source_rows,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert)
 {
 
     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax
     2) This implementation assumes k is small, but will work for any k.
 */
 
-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
+    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
         int* source_rows, const int k, const int start_expert, const int end_expert)
 {
     // We begin by enforcing compile time assertions and setting up compile time constants.
@@ -397,8 +405,8 @@ struct TopkConstants
 };
 } // namespace detail
 
-template <int EXPERTS, int WARPS_PER_TB>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
+template <int EXPERTS, int WARPS_PER_TB, typename IndType>
+void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
     int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
@@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
             token_expert_indices, num_tokens, topk, 0, num_experts, \
             stream);
 
+template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
     const float* gating_output,
     float* topk_weights,
-    int* topk_indicies,
+    IndType* topk_indicies,
     int* token_expert_indices,
     float* softmax_workspace,
     const int num_tokens,
@@ -493,14 +502,32 @@ void topk_softmax(
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-    vllm::moe::topkGatingSoftmaxKernelLauncher(
-        gating_output.data_ptr<float>(),
-        topk_weights.data_ptr<float>(),
-        topk_indices.data_ptr<int>(),
-        token_expert_indices.data_ptr<int>(),
-        softmax_workspace.data_ptr<float>(),
-        num_tokens,
-        num_experts,
-        topk,
-        stream);
+
+    if(topk_indices.scalar_type() == at::ScalarType::Int)
+    {
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
+    else
+    {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<uint32_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
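
A minimal sketch (standalone names, not vLLM's) of the host-side pattern this file adopts: a runtime check on the index tensor's dtype selects the template instantiation, so one templated launcher serves both int32 and uint32 index buffers:

#include <cassert>
#include <cstdint>

// Stand-in for the real templated kernel launcher.
template <typename IndType>
void launch_topk(IndType* indices, int n) {
  for (int i = 0; i < n; i++) indices[i] = static_cast<IndType>(i);
}

enum class Dtype { Int32, UInt32 };

void topk_dispatch(void* indices, Dtype dtype, int n) {
  if (dtype == Dtype::Int32) {
    launch_topk(static_cast<int32_t*>(indices), n);
  } else {
    // Only two index dtypes are supported in this sketch.
    assert(dtype == Dtype::UInt32);
    launch_topk(static_cast<uint32_t*>(indices), n);
  }
}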

csrc/pos_encoding_kernels.cu

Lines changed: 28 additions & 12 deletions

@@ -44,15 +44,17 @@ inline __device__ void apply_rotary_embedding(
                                // head_size]
     const scalar_t* cache_ptr, const int head_size, const int num_heads,
     const int num_kv_heads, const int rot_dim, const int token_idx,
-    const int64_t query_stride, const int64_t key_stride) {
+    const int64_t query_stride, const int64_t key_stride,
+    const int64_t head_stride) {
   const int embed_dim = rot_dim / 2;
   const scalar_t* cos_ptr = cache_ptr;
   const scalar_t* sin_ptr = cache_ptr + embed_dim;
 
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
+    const int64_t token_head =
+        token_idx * query_stride + head_idx * head_stride;
     const int rot_offset = i % embed_dim;
     apply_token_rotary_embedding<scalar_t, IS_NEOX>(
         query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
@@ -62,7 +64,8 @@ inline __device__ void apply_rotary_embedding(
   const int nk = num_kv_heads * embed_dim;
   for (int i = threadIdx.x; i < nk; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+    const int64_t token_head =
+        token_idx * key_stride + head_idx * head_stride;
     const int rot_offset = i % embed_dim;
     apply_token_rotary_embedding<scalar_t, IS_NEOX>(
         key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
@@ -84,15 +87,16 @@ __global__ void rotary_embedding_kernel(
     const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                  // 2]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int num_heads, const int num_kv_heads, const int head_size) {
+    const int64_t head_stride, const int num_heads, const int num_kv_heads,
+    const int head_size) {
   // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
   const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
   apply_rotary_embedding<scalar_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride);
+      token_idx, query_stride, key_stride, head_stride);
 }
 
 template <typename scalar_t, bool IS_NEOX>
@@ -109,9 +113,9 @@ __global__ void batched_rotary_embedding_kernel(
     const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                  // 2]
     const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
-                                                        // or [num_tokens]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int num_heads, const int num_kv_heads, const int head_size) {
+    const int64_t head_stride, const int num_heads, const int num_kv_heads,
+    const int head_size) {
   // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
@@ -121,7 +125,7 @@ __global__ void batched_rotary_embedding_kernel(
 
   apply_rotary_embedding<scalar_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride);
+      token_idx, query_stride, key_stride, head_stride);
 }
 
 }  // namespace vllm
@@ -179,6 +183,12 @@ void rotary_embedding(
   int seq_dim_idx = positions_ndim - 1;
   int64_t query_stride = query.stride(seq_dim_idx);
   int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
+  // Determine head stride: for [*, heads, head_size] use stride of last dim;
+  // for flat [*, heads*head_size], heads blocks are contiguous of size
+  // head_size
+  int query_ndim = query.dim();
+  int64_t head_stride =
+      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
 
   dim3 grid(num_tokens);
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
@@ -190,14 +200,14 @@ void rotary_embedding(
             positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride, key_stride,
-            num_heads, num_kv_heads, head_size);
+            head_stride, num_heads, num_kv_heads, head_size);
   } else {
     vllm::rotary_embedding_kernel<scalar_t, false>
         <<<grid, block, 0, stream>>>(
             positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
-            key_stride, num_heads, num_kv_heads, head_size);
+            key_stride, head_stride, num_heads, num_kv_heads, head_size);
   }
   });
 }
@@ -263,6 +273,12 @@ void batched_rotary_embedding(
   int seq_dim_idx = positions_ndim - 1;
   int64_t query_stride = query.stride(seq_dim_idx);
   int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
+  // Determine head stride: for [*, heads, head_size] use stride of last dim;
+  // for flat [*, heads*head_size], heads blocks are contiguous of size
+  // head_size
+  int query_ndim = query.dim();
+  int64_t head_stride =
+      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
 
   dim3 grid(num_tokens);
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
@@ -276,15 +292,15 @@ void batched_rotary_embedding(
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(),
             cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-            key_stride, num_heads, num_kv_heads, head_size);
+            key_stride, head_stride, num_heads, num_kv_heads, head_size);
   } else {
     vllm::batched_rotary_embedding_kernel<scalar_t, false>
         <<<grid, block, 0, stream>>>(
            positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
            key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
            cos_sin_cache.data_ptr<scalar_t>(),
            cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-           key_stride, num_heads, num_kv_heads, head_size);
+           key_stride, head_stride, num_heads, num_kv_heads, head_size);
  }
  });
}
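
A minimal sketch of the addressing rule the new head_stride parameter encodes (names illustrative): for a flat [num_tokens, num_heads * head_size] query, heads sit head_size apart, while a strided 3-D view, such as a slice of a fused QKV buffer, carries its own head stride that may be larger:

#include <cstdint>

// Offset of element (token_idx, head_idx, 0) under either layout.
int64_t head_offset(int64_t token_idx, int64_t head_idx,
                    int64_t token_stride, int64_t head_stride) {
  return token_idx * token_stride + head_idx * head_stride;
}

// Flat [num_tokens, num_heads * head_size]: head_stride == head_size.
// Strided [num_tokens, num_heads, head_size] view: head_stride ==
// query.stride(-2), which can exceed head_size when heads are not packed.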

csrc/quantization/activation_kernels.cu

Lines changed: 2 additions & 1 deletion

@@ -112,7 +112,8 @@ __global__ void act_and_mul_quant_kernel(
 void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
                         torch::Tensor& input,  // [..., 2 * d]
                         torch::Tensor& scale) {
-  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn ||
+              out.dtype() == torch::kFloat8_e4m3fnuz);
   TORCH_CHECK(input.dtype() == torch::kFloat16 ||
               input.dtype() == torch::kBFloat16);
   TORCH_CHECK(input.size(-1) % 2 == 0);

docs/source/design/v1/metrics.md

Lines changed: 2 additions & 2 deletions

@@ -415,8 +415,8 @@ The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
 some interesting points which may be relevant to how we approach
 future metrics.
 
-Every time the prefix cache is queried, we record the number of blocks
-queried and the number of queried blocks present in the cache
+Every time the prefix cache is queried, we record the number of tokens
+queried and the number of queried tokens present in the cache
 (i.e. hits).
 
 However, the metric of interest is the hit rate - i.e. the number of
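
A minimal sketch of the token-level counters described above and the hit rate derived from them (illustrative, not vLLM's implementation); keeping both counters monotonic lets a hit rate be computed over any interval by differencing snapshots:

#include <cstdint>

struct PrefixCacheStats {
  uint64_t queried_tokens = 0;  // tokens looked up in the prefix cache
  uint64_t hit_tokens = 0;      // queried tokens found in the cache

  void record_query(uint64_t queried, uint64_t hits) {
    queried_tokens += queried;
    hit_tokens += hits;
  }

  // Hit rate over the lifetime of the counters.
  double hit_rate() const {
    return queried_tokens ? double(hit_tokens) / double(queried_tokens) : 0.0;
  }
};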
