
Commit d89a660

merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-forward-structured-outputs
2 parents: 1262acc + f25e0d1

174 files changed: +6821 −1850 lines

.buildkite/scripts/upload-wheels.sh

Lines changed: 1 addition & 0 deletions

@@ -75,3 +75,4 @@ else
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions

@@ -309,6 +309,7 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min

csrc/activation_kernels.cu

Lines changed: 3 additions & 0 deletions

@@ -70,6 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1);              \
   dim3 grid(num_tokens);                                            \
   dim3 block(std::min(d, 1024));                                    \
+  if (num_tokens == 0) {                                            \
+    return;                                                         \
+  }                                                                 \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();     \
   VLLM_DISPATCH_FLOATING_TYPES(                                     \
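
A minimal sketch of why this guard matters (names illustrative, not vLLM's): CUDA rejects launches with a zero-sized grid, so a wrapper that sizes its grid as one block per token must return early when the input is empty:

#include <cstdint>
#include <algorithm>
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, int d) {
  // One block per token; threads stride across the hidden dimension d.
  float* row = data + static_cast<int64_t>(blockIdx.x) * d;
  for (int i = threadIdx.x; i < d; i += blockDim.x) row[i] *= 2.0f;
}

void launch_scale(float* data, int num_tokens, int d) {
  // Mirrors the hunk above: dim3 grid(0) is an invalid launch configuration.
  if (num_tokens == 0) return;
  dim3 grid(num_tokens);
  dim3 block(std::min(d, 1024));
  scale_kernel<<<grid, block>>>(data, d);
}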

csrc/dispatch_utils.h

Lines changed: 14 additions & 0 deletions

@@ -65,5 +65,19 @@
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
 
+#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
+
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
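
For context, a minimal sketch of the switch-and-rebind pattern these macros expand to, with a hand-rolled enum standing in for at::ScalarType (everything here is illustrative):

#include <cstdint>
#include <stdexcept>

enum class ScalarType { Int, Long, UInt32 };

template <typename T> struct TypeTag { using type = T; };

// Each case hands the callback a tag carrying the concrete element type, so
// one call site can launch a templated kernel for every supported dtype,
// signed or unsigned.
template <typename Fn>
void dispatch_integral_and_unsigned(ScalarType t, Fn&& fn) {
  switch (t) {
    case ScalarType::Int:    fn(TypeTag<int32_t>{});  break;
    case ScalarType::Long:   fn(TypeTag<int64_t>{});  break;
    case ScalarType::UInt32: fn(TypeTag<uint32_t>{}); break;
    default: throw std::runtime_error("unsupported dtype");
  }
}

// Usage: the lambda recovers scalar_t from the tag, as the macros do.
// dispatch_integral_and_unsigned(t, [&](auto tag) {
//   using scalar_t = typename decltype(tag)::type;
//   run_kernel<scalar_t>(/* ... */);
// });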

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 7 additions & 7 deletions

@@ -473,15 +473,15 @@ __global__ void Marlin(
     if (mul_topk_weights) {
 #pragma unroll
       for (int i = 0; i < 4; i++) {
+        int idx = tid4 * 4 + i;
+        idx = idx < block_num_valid_tokens ? idx : 0;
         if constexpr (w_type == vllm::kFE2M1f) {
-          sh_block_topk_weights[tid4 * 4 + i] = __hmul2(
-              global_scale,
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]])));
+          sh_block_topk_weights[idx] = __hmul2(
+              global_scale, Dtype::num2num2(Dtype::float2num(
+                                topk_weights_ptr[sh_block_sorted_ids[idx]])));
         } else {
-          sh_block_topk_weights[tid4 * 4 + i] =
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]]));
+          sh_block_topk_weights[idx] = Dtype::num2num2(
+              Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
         }
       }
     }
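
A minimal sketch of the bounds guard this hunk introduces, with illustrative names: lanes whose index falls past the number of valid tokens are redirected to slot 0, so every thread issues an in-bounds load and no divergent branch is needed:

__device__ void gather_topk_weights(float* dst, const float* weights,
                                    const int* sorted_ids, int tid4,
                                    int num_valid) {
#pragma unroll
  for (int i = 0; i < 4; i++) {
    int idx = tid4 * 4 + i;
    // Clamp instead of skipping the load: out-of-range lanes all rewrite
    // slot 0 with the same value, which is benign, and never touch memory
    // past the end of the table.
    idx = idx < num_valid ? idx : 0;
    dst[idx] = weights[sorted_ids[idx]];
  }
}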

csrc/moe/moe_align_sum_kernels.cu

Lines changed: 4 additions & 4 deletions

@@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   }
 
   if (use_global_memory) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
@@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
             cumsum_buffer.data_ptr<int32_t>());
       });
   } else if (use_i16) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
           auto kernel =
@@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               topk_ids.numel());
       });
   } else {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
          auto kernel =
              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
@@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   TORCH_CHECK(num_experts == 256,
               "sgl_moe_align_block_size kernel only supports deepseek v3.");
 
-  VLLM_DISPATCH_INTEGRAL_TYPES(
+  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =

csrc/moe/topk_softmax_kernels.cu

Lines changed: 45 additions & 18 deletions

@@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__
     }
 }
 
-template <int TPB>
-__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
-    int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
+template <int TPB, typename IndType>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    IndType* indices,
+    int* source_rows,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert)
 {
 
     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax
     2) This implementation assumes k is small, but will work for any k.
 */
 
-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
+    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
         int* source_rows, const int k, const int start_expert, const int end_expert)
 {
     // We begin by enforcing compile time assertions and setting up compile time constants.
@@ -397,8 +405,8 @@ struct TopkConstants
 };
 } // namespace detail
 
-template <int EXPERTS, int WARPS_PER_TB>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
+template <int EXPERTS, int WARPS_PER_TB, typename IndType>
+void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
     int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
@@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
             token_expert_indices, num_tokens, topk, 0, num_experts, \
             stream);
 
+template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
     const float* gating_output,
     float* topk_weights,
-    int* topk_indicies,
+    IndType* topk_indicies,
     int* token_expert_indices,
     float* softmax_workspace,
     const int num_tokens,
@@ -493,14 +502,32 @@ void topk_softmax(
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-    vllm::moe::topkGatingSoftmaxKernelLauncher(
-        gating_output.data_ptr<float>(),
-        topk_weights.data_ptr<float>(),
-        topk_indices.data_ptr<int>(),
-        token_expert_indices.data_ptr<int>(),
-        softmax_workspace.data_ptr<float>(),
-        num_tokens,
-        num_experts,
-        topk,
-        stream);
+
+    if(topk_indices.scalar_type() == at::ScalarType::Int)
+    {
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
+    else
+    {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<uint32_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
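
A minimal sketch (standalone names, not vLLM's) of the host-side pattern this file adopts: a runtime check on the index tensor's dtype selects the template instantiation, so one templated launcher serves both int32 and uint32 index buffers:

#include <cassert>
#include <cstdint>

// Stand-in for the real templated kernel launcher.
template <typename IndType>
void launch_topk(IndType* indices, int n) {
  for (int i = 0; i < n; i++) indices[i] = static_cast<IndType>(i);
}

enum class Dtype { Int32, UInt32 };

void topk_dispatch(void* indices, Dtype dtype, int n) {
  if (dtype == Dtype::Int32) {
    launch_topk(static_cast<int32_t*>(indices), n);
  } else {
    // Only two index dtypes are supported in this sketch.
    assert(dtype == Dtype::UInt32);
    launch_topk(static_cast<uint32_t*>(indices), n);
  }
}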

csrc/pos_encoding_kernels.cu

Lines changed: 28 additions & 12 deletions

@@ -44,15 +44,17 @@ inline __device__ void apply_rotary_embedding(
                                // head_size]
     const scalar_t* cache_ptr, const int head_size, const int num_heads,
     const int num_kv_heads, const int rot_dim, const int token_idx,
-    const int64_t query_stride, const int64_t key_stride) {
+    const int64_t query_stride, const int64_t key_stride,
+    const int64_t head_stride) {
   const int embed_dim = rot_dim / 2;
   const scalar_t* cos_ptr = cache_ptr;
   const scalar_t* sin_ptr = cache_ptr + embed_dim;
 
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
+    const int64_t token_head =
+        token_idx * query_stride + head_idx * head_stride;
     const int rot_offset = i % embed_dim;
     apply_token_rotary_embedding<scalar_t, IS_NEOX>(
         query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
@@ -62,7 +64,8 @@ inline __device__ void apply_rotary_embedding(
   const int nk = num_kv_heads * embed_dim;
   for (int i = threadIdx.x; i < nk; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+    const int64_t token_head =
+        token_idx * key_stride + head_idx * head_stride;
     const int rot_offset = i % embed_dim;
     apply_token_rotary_embedding<scalar_t, IS_NEOX>(
         key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
@@ -84,15 +87,16 @@ __global__ void rotary_embedding_kernel(
     const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                  // 2]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int num_heads, const int num_kv_heads, const int head_size) {
+    const int64_t head_stride, const int num_heads, const int num_kv_heads,
+    const int head_size) {
   // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
   const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
   apply_rotary_embedding<scalar_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride);
+      token_idx, query_stride, key_stride, head_stride);
 }
 
 template <typename scalar_t, bool IS_NEOX>
@@ -109,9 +113,9 @@ __global__ void batched_rotary_embedding_kernel(
     const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                  // 2]
     const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
-                                                        // or [num_tokens]
     const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int num_heads, const int num_kv_heads, const int head_size) {
+    const int64_t head_stride, const int num_heads, const int num_kv_heads,
+    const int head_size) {
   // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
@@ -121,7 +125,7 @@ __global__ void batched_rotary_embedding_kernel(
 
   apply_rotary_embedding<scalar_t, IS_NEOX>(
       query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride);
+      token_idx, query_stride, key_stride, head_stride);
 }
 
 }  // namespace vllm
@@ -179,6 +183,12 @@ void rotary_embedding(
   int seq_dim_idx = positions_ndim - 1;
   int64_t query_stride = query.stride(seq_dim_idx);
   int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
+  // Determine head stride: for [*, heads, head_size] use stride of last dim;
+  // for flat [*, heads*head_size], heads blocks are contiguous of size
+  // head_size
+  int query_ndim = query.dim();
+  int64_t head_stride =
+      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
 
   dim3 grid(num_tokens);
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
@@ -190,14 +200,14 @@ void rotary_embedding(
             positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride, key_stride,
-            num_heads, num_kv_heads, head_size);
+            head_stride, num_heads, num_kv_heads, head_size);
   } else {
     vllm::rotary_embedding_kernel<scalar_t, false>
         <<<grid, block, 0, stream>>>(
             positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
-            key_stride, num_heads, num_kv_heads, head_size);
+            key_stride, head_stride, num_heads, num_kv_heads, head_size);
   }
   });
 }
@@ -263,6 +273,12 @@ void batched_rotary_embedding(
   int seq_dim_idx = positions_ndim - 1;
   int64_t query_stride = query.stride(seq_dim_idx);
   int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
+  // Determine head stride: for [*, heads, head_size] use stride of last dim;
+  // for flat [*, heads*head_size], heads blocks are contiguous of size
+  // head_size
+  int query_ndim = query.dim();
+  int64_t head_stride =
+      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
 
   dim3 grid(num_tokens);
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
@@ -276,15 +292,15 @@ void batched_rotary_embedding(
             key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
             cos_sin_cache.data_ptr<scalar_t>(),
             cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-            key_stride, num_heads, num_kv_heads, head_size);
+            key_stride, head_stride, num_heads, num_kv_heads, head_size);
   } else {
     vllm::batched_rotary_embedding_kernel<scalar_t, false>
         <<<grid, block, 0, stream>>>(
            positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
            key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
            cos_sin_cache.data_ptr<scalar_t>(),
            cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-           key_stride, num_heads, num_kv_heads, head_size);
+           key_stride, head_stride, num_heads, num_kv_heads, head_size);
  }
  });
}
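
A minimal sketch of the addressing rule the new head_stride parameter encodes (names illustrative): for a flat [num_tokens, num_heads * head_size] query, heads sit head_size apart, while a strided 3-D view, such as a slice of a fused QKV buffer, carries its own head stride that may be larger:

#include <cstdint>

// Offset of element (token_idx, head_idx, 0) under either layout.
int64_t head_offset(int64_t token_idx, int64_t head_idx,
                    int64_t token_stride, int64_t head_stride) {
  return token_idx * token_stride + head_idx * head_stride;
}

// Flat [num_tokens, num_heads * head_size]: head_stride == head_size.
// Strided [num_tokens, num_heads, head_size] view: head_stride ==
// query.stride(-2), which can exceed head_size when heads are not packed.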

csrc/quantization/activation_kernels.cu

Lines changed: 2 additions & 1 deletion

@@ -112,7 +112,8 @@ __global__ void act_and_mul_quant_kernel(
 void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
                         torch::Tensor& input,  // [..., 2 * d]
                         torch::Tensor& scale) {
-  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn ||
+              out.dtype() == torch::kFloat8_e4m3fnuz);
   TORCH_CHECK(input.dtype() == torch::kFloat16 ||
               input.dtype() == torch::kBFloat16);
   TORCH_CHECK(input.size(-1) % 2 == 0);

docs/source/design/v1/metrics.md

Lines changed: 2 additions & 2 deletions

@@ -415,8 +415,8 @@ The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
 some interesting points which may be relevant to how we approach
 future metrics.
 
-Every time the prefix cache is queried, we record the number of blocks
-queried and the number of queried blocks present in the cache
+Every time the prefix cache is queried, we record the number of tokens
+queried and the number of queried tokens present in the cache
 (i.e. hits).
 
 However, the metric of interest is the hit rate - i.e. the number of
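
A minimal sketch of the token-level counters described above and the hit rate derived from them (illustrative, not vLLM's implementation); keeping both counters monotonic lets a hit rate be computed over any interval by differencing snapshots:

#include <cstdint>

struct PrefixCacheStats {
  uint64_t queried_tokens = 0;  // tokens looked up in the prefix cache
  uint64_t hit_tokens = 0;      // queried tokens found in the cache

  void record_query(uint64_t queried, uint64_t hits) {
    queried_tokens += queried;
    hit_tokens += hits;
  }

  // Hit rate over the lifetime of the counters.
  double hit_rate() const {
    return queried_tokens ? double(hit_tokens) / double(queried_tokens) : 0.0;
  }
};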
