144 commits
69d46da
Add meta functions for ops to prevent graph breaks
bnellnm Jul 29, 2024
679470c
format
bnellnm Jul 29, 2024
08f969f
add torch.compile to loader + symint support for gptq_gemm_meta + twe…
bnellnm Jul 29, 2024
7f5946d
pull out punica support test, move torch.compile to runner to avoid w…
bnellnm Aug 1, 2024
2853e5d
tweaks
bnellnm Aug 1, 2024
1559470
change codebook_partition_sizes to List[int]
bnellnm Aug 2, 2024
929894d
use string schemas for all functions
bnellnm Aug 5, 2024
b707b57
back out lora test hacks
bnellnm Aug 5, 2024
43cbb23
cleanups
bnellnm Aug 5, 2024
7d9ab09
fix flash_attn
bnellnm Aug 5, 2024
33ab5fe
fix marlin schemas and meta funcs
bnellnm Aug 5, 2024
69bfa21
fix format
bnellnm Aug 6, 2024
729d99c
add some opcheck tests
bnellnm Aug 6, 2024
e59aa74
fix registrations for non-Tensor ops
bnellnm Aug 6, 2024
b7a851b
rebase + fix gguf registrations
bnellnm Aug 6, 2024
d57d913
update PR template with info on pytorch registration
bnellnm Aug 6, 2024
ea22ab5
try registering meta-function via python to handle symbolic shapes
bnellnm Aug 6, 2024
58fb6b6
format
bnellnm Aug 6, 2024
61ba2ad
conditionally register gptq_marlin_24_gemm_fake
bnellnm Aug 6, 2024
e31eedd
format stuff
bnellnm Aug 6, 2024
624732a
try python meta functions
bnellnm Aug 7, 2024
2469661
temporarily add opchecks to almost all custom ops
bnellnm Aug 7, 2024
6c1213b
comment out opchecks
bnellnm Aug 7, 2024
0c45db3
remove temporary opchecks in _custom_ops
bnellnm Aug 7, 2024
a32cff8
tweak copy_blocks schema
bnellnm Aug 8, 2024
715b731
remove most C++ meta functions
bnellnm Aug 8, 2024
a17e0ac
activation opcheck tests
bnellnm Aug 9, 2024
36091b4
add more opcheck tests
bnellnm Aug 9, 2024
b9e73e7
add more opcheck tests
bnellnm Aug 9, 2024
357a622
run opchecks on fewer combinations to reduce memory use
bnellnm Aug 9, 2024
e9970b5
use @youkaichao's flash_attn registration
bnellnm Aug 9, 2024
a130ca9
fix format
bnellnm Aug 9, 2024
a86d017
fix cutlass test
bnellnm Aug 9, 2024
c32f7d3
add custom op for tensor_model_parallel_all_reduce
SageMoore Aug 9, 2024
079fc84
format
SageMoore Aug 9, 2024
a6ca952
register lora triton ops to avoid dynamo problems
bnellnm Aug 9, 2024
d1b26f2
fix cpu support in tensor_model_parallel_all_reduce
SageMoore Aug 9, 2024
cbb2be9
format
SageMoore Aug 9, 2024
abe7865
cleanups
bnellnm Aug 5, 2024
88357b3
fix flash_attn signatures
bnellnm Aug 12, 2024
53f3147
rebase + cleanups
bnellnm Aug 12, 2024
908c254
tweaks + add gc.collect() to fix memory profiling errors when dynamo …
bnellnm Aug 13, 2024
d3eb42d
fix broken env var
bnellnm Aug 13, 2024
3dc4141
add clones to all_reduce
SageMoore Aug 14, 2024
6a4ad9c
fix format
bnellnm Aug 13, 2024
9cf3ac9
fix aqlm custom op type annotations
bnellnm Aug 14, 2024
68b3a74
fix gptq custom op registration
bnellnm Aug 14, 2024
3f4cce4
add dynamo support for ScalarType
bnellnm Aug 14, 2024
4015920
add some pointers to PT2 custom class docs
bnellnm Aug 14, 2024
87ccdac
tweaks
bnellnm Aug 16, 2024
38d1bda
fix merge
bnellnm Aug 16, 2024
ed4a565
fix cpu schemas
bnellnm Aug 16, 2024
af6302f
fix merge
bnellnm Aug 17, 2024
0168f9e
rebase + add meta functions for machete kernels
bnellnm Aug 20, 2024
d6243cd
Custom torch.compile backend prototype
bnellnm Apr 25, 2024
5d64ad1
integration wip
bnellnm May 17, 2024
a8a0103
wip
bnellnm May 21, 2024
4dc6caf
wip
bnellnm May 26, 2024
fcb5a03
wip
bnellnm May 27, 2024
9b248c4
wip
bnellnm May 31, 2024
721b38f
wip
bnellnm May 31, 2024
bdfeebc
merge
bnellnm Jun 3, 2024
2dd4d52
wip
bnellnm Jun 5, 2024
02c704c
progress
bnellnm Jun 6, 2024
584a0f0
optimize whole module
bnellnm Jun 6, 2024
8926cdd
remove partitioner
bnellnm Jun 7, 2024
ffd7c6f
wip
bnellnm Jun 9, 2024
73223d2
wip
bnellnm Jun 9, 2024
8b2749b
wip
bnellnm Jun 12, 2024
f54da8c
almost matching
bnellnm Jun 12, 2024
84fc16c
almost matching
bnellnm Jun 12, 2024
c48fb44
fix some stuff
bnellnm Jun 12, 2024
59e13db
flash attn support
bnellnm Jun 13, 2024
a66e257
return instead of throw
bnellnm Jun 13, 2024
fd13948
compile fixes
bnellnm Jun 13, 2024
17a5537
add support for multi output fused ops
bnellnm Jun 14, 2024
c042b2c
comment
bnellnm Jun 14, 2024
d59e905
wip inplace op fusion
bnellnm Jun 15, 2024
fff949c
fix node_users
bnellnm Jun 15, 2024
8f42e32
fix FlowGraph dependencies
bnellnm Jun 19, 2024
0e9b2e1
speed up SubGraph creation by reusing dependencies from FlowGraph
bnellnm Jun 19, 2024
60c14d2
fix input ordering issues for fused ops
bnellnm Jun 19, 2024
bca48ce
fix last_input method
bnellnm Jun 20, 2024
69e01e9
add torch.narrow to supported ops
bnellnm Jun 20, 2024
87bfb74
fp8 fixes
bnellnm Jun 20, 2024
353d7df
fix SubGraph toposort
bnellnm Jun 20, 2024
b270917
add constant arg values to mangled names
bnellnm Jun 20, 2024
81268c3
comments + cleanups
bnellnm Jun 21, 2024
032bcdf
rebase + fixes
bnellnm Jul 9, 2024
72021fb
fix
bnellnm Jul 9, 2024
fd59146
cleanup wip
bnellnm Jul 10, 2024
7120e21
refactor op generator
bnellnm Jul 10, 2024
ee25bf7
simplify generator
bnellnm Jul 11, 2024
e35b4af
kernels and kernel accessories
SageMoore Jul 11, 2024
a10520e
comments
SageMoore Jul 11, 2024
5bd856a
name mangling fixes
bnellnm Jul 11, 2024
d5e7601
add rms_norm support
bnellnm Jul 11, 2024
fbc6404
silu mul quant fixes
SageMoore Jul 12, 2024
d8ec04b
rename ex directory
bnellnm Jul 12, 2024
1aa7432
tweaks
bnellnm Jul 12, 2024
2b64edd
remove hard coded shape from silu mul quant
SageMoore Jul 12, 2024
554c5db
lint fixes
bnellnm Jul 12, 2024
3a03b96
more lint fixes
bnellnm Jul 12, 2024
62094ad
more lint fixes
bnellnm Jul 12, 2024
f905885
break up long lines and hopefully not break code
bnellnm Jul 12, 2024
b06c5e5
more lint fixes
bnellnm Jul 12, 2024
28094ec
don't link directly against vllm shared libs in naive codegen
bnellnm Jul 16, 2024
cd3f434
formatting
bnellnm Jul 16, 2024
c8da18e
deterministic subgraph topo sort
bnellnm Jul 16, 2024
aa12834
make sure things are only registered once
bnellnm Jul 17, 2024
1c1b824
fix bug in node_function_target, add empty __init__.py
bnellnm Jul 17, 2024
5148b44
enable optimizer for all models
bnellnm Jul 17, 2024
cee53ec
apply optimizer to all models + fix some bugs
bnellnm Jul 19, 2024
edb6107
removed const values from fused kernels
SageMoore Jul 22, 2024
c559fb0
cleanup
SageMoore Jul 22, 2024
42fb293
more cleanup
SageMoore Jul 23, 2024
9848f15
disabling const extraction stuff
SageMoore Jul 23, 2024
6a75a38
disabling const extraction stuff
SageMoore Jul 23, 2024
57ac451
wip valid subgraph
bnellnm Jul 24, 2024
e2b3182
fix valid subgraph
bnellnm Jul 24, 2024
8a14f60
temporarily disable optimizer so we can diagnose dynamo issues
bnellnm Jul 25, 2024
47771bb
fix flash_attn missing window_size
bnellnm Jul 26, 2024
feb1e84
fix advance schema
bnellnm Jul 26, 2024
8f42319
update gptq_marlin_gemm schema and meta fn
bnellnm Jul 26, 2024
45e6451
fix flash_attn registration
bnellnm Jul 26, 2024
bea04ba
re-enable silu_and_mul kernel
bnellnm Jul 26, 2024
c84c60c
a few tweaks
bnellnm Jul 26, 2024
325b861
more tweaks
bnellnm Jul 26, 2024
2a465e6
update torch to 2.4
SageMoore Jul 29, 2024
9931233
add some meta functions, revert some pt2.3 hacks
bnellnm Jul 29, 2024
10e73a2
comments
bnellnm Aug 12, 2024
8dd4382
move optimizer call
bnellnm Aug 20, 2024
e14e255
merge in fix-graph-breaks
bnellnm Aug 20, 2024
6793994
symint and other fixes
bnellnm Aug 23, 2024
355436f
add support for torch.Tensor.size
bnellnm Aug 23, 2024
5fa9530
rewrite gather function
bnellnm Aug 30, 2024
7de4709
revert some wip so fp8 model will run
bnellnm Sep 3, 2024
0b3f4bc
wip. TODO: dynamic_fp8 wrong answer
bnellnm Sep 4, 2024
d7fa26d
support floordiv
bnellnm Sep 9, 2024
578e8a1
update fusion passes for llama 3
SageMoore Sep 6, 2024
2e5d996
more fused operators
SageMoore Sep 6, 2024
6cbf5e4
prepare for commit
SageMoore Sep 10, 2024
cbb88ed
remove prints
SageMoore Sep 10, 2024
64a88bb
reorder args in silu_mul_quant
SageMoore Sep 10, 2024
10 changes: 10 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
</ul>

<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See the above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops (a minimal sketch follows this diff). See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
</ul>

<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>

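The checklist above is easier to follow with a concrete, simplified example. The sketch below is hypothetical -- the `mylib::row_sum` op and its schema are made up and stand in for a real C++-registered kernel -- but it shows the string schema, the Python meta ("fake") function that handles dynamic shapes, and the `torch.library.opcheck()` call that the bullet points describe.

```python
# A minimal sketch, not vLLM's actual registration code: define an op with a
# string schema, register a backend implementation, register a Python meta
# ("fake") function so dynamic dims work under torch.compile, and validate the
# registration with torch.library.opcheck(). The namespace "mylib" and the op
# "row_sum" are hypothetical; vLLM's real ops are defined in C++ via
# TORCH_LIBRARY under the _C extension.
import torch
from torch.library import Library, opcheck

my_lib = Library("mylib", "DEF")
my_lib.define("row_sum(Tensor x) -> Tensor")

def row_sum_impl(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for a real CPU/CUDA kernel.
    return x.sum(dim=-1)

my_lib.impl("row_sum", row_sum_impl, "CPU")

@torch.library.register_fake("mylib::row_sum")
def row_sum_fake(x: torch.Tensor) -> torch.Tensor:
    # Shape-only computation; new_empty keeps dtype/device and handles symbolic
    # sizes, which is what prevents graph breaks for dynamic batch dimensions.
    return x.new_empty(x.shape[:-1])

# opcheck exercises the schema, the fake/meta function, and the registration.
opcheck(torch.ops.mylib.row_sum.default, (torch.randn(4, 8),))
```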
4 changes: 3 additions & 1 deletion CMakeLists.txt
@@ -213,7 +213,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/layernorm_kernels/rms_norm_quant.cu"
"csrc/quantization/layernorm_kernels/activation_kernels.cu")

#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
8 changes: 4 additions & 4 deletions csrc/cpu/torch_bindings.cpp
@@ -27,8 +27,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// PagedAttention V2.
ops.def(
"paged_attention_v2("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache,"
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
@@ -95,8 +95,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

// Copy the cache blocks from src to dst.
cache_ops.def(
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
"block_mapping) -> ()");
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

// Reshape the key and value tensors and cache them.
42 changes: 42 additions & 0 deletions csrc/ops.h
@@ -123,9 +123,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n,
int64_t num_bits);

torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
torch::Tensor& perm, c10::SymInt size_k,
c10::SymInt size_n, int64_t num_bits);

torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
int64_t size_n, int64_t num_bits);

torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
c10::SymInt size_k, c10::SymInt size_n,
int64_t num_bits);

torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
int64_t n);

@@ -164,6 +172,35 @@ torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
int64_t size_n, int64_t size_k);
#endif

// These are kernels used by qqq
// torch::Tensor qqq_gemm(
// torch::Tensor& a,
// torch::Tensor& b_q_weight,
// torch::Tensor& s1,
// torch::Tensor& s2,
// torch::Tensor& s3,
// torch::Tensor& workspace,
// int64_t size_m,
// int64_t size_n,
// int64_t size_k);

void rms_norm_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& tmp, torch::Tensor const& weight,
torch::Tensor& scale, double const epsilon);

void add_residual_rms_norm_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& residual, torch::Tensor& tmp,
torch::Tensor const& weight,
torch::Tensor& scale, double const epsilon);

void silu_and_mul_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scale, torch::Tensor& tmp);

// void quant(
// torch::Tensor& out,
// torch::Tensor& input,
// torch::Tensor& scale);

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor const& scale);

@@ -178,6 +215,11 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit);

torch::Tensor gptq_gemm_meta(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit);

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
31 changes: 31 additions & 0 deletions csrc/quantization/aqlm/gemm_kernels.cu
@@ -595,3 +595,34 @@ torch::Tensor aqlm_dequant(
" entries is not currently supported.")
return {};
}

torch::Tensor aqlm_gemm_meta(const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const torch::Tensor& codebook_partition_sizes,
const std::optional<torch::Tensor>& bias) {


auto out_features = codes.size(0) * codebooks.size(2);
auto flat_input = input.reshape({-1, input.size(-1)});
auto flat_output = torch::empty(
{flat_input.size(0), out_features},
torch::TensorOptions().dtype(input.dtype()).device(input.device()));

auto output_sizes = input.sizes().vec();
output_sizes.pop_back();
output_sizes.push_back(-1);
return flat_output.reshape(output_sizes);
}

torch::Tensor aqlm_dequant_meta(const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& codebook_partition_sizes) {
auto in_features = codes.size(1) * 8;
auto out_features = codes.size(0);
return torch::empty({out_features, in_features},
torch::TensorOptions()
.dtype(codebooks.dtype())
.device(codebooks.device()));
}
26 changes: 26 additions & 0 deletions csrc/quantization/awq/gemm_kernels.cu
@@ -524,3 +524,29 @@ torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
}
return _out_feats.sum(0);
}

torch::Tensor awq_gemm_meta(torch::Tensor _in_feats, torch::Tensor _kernel,
torch::Tensor _scaling_factors,
torch::Tensor _zeros, int64_t split_k_iters) {
auto num_in_feats = _in_feats.size(0);
auto options = torch::TensorOptions()
.dtype(_in_feats.dtype())
.device(_in_feats.device());
return torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8},
options).sum(0);
}

torch::Tensor awq_dequantize_meta(torch::Tensor _kernel,
torch::Tensor _scaling_factors,
torch::Tensor _zeros, int64_t split_k_iters,
int64_t thx, int64_t thy) {
auto in_c = _kernel.size(0);
auto qout_c = _kernel.size(1);
auto out_c = qout_c * 8;

auto options = torch::TensorOptions()
.dtype(_scaling_factors.dtype())
.device(_scaling_factors.device());

return torch::empty({in_c, out_c}, options);
}
9 changes: 9 additions & 0 deletions csrc/quantization/fp8/fp8_marlin.cu
@@ -1303,3 +1303,12 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
}

#endif

torch::Tensor fp8_marlin_gemm_meta(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales,
torch::Tensor& workspace, int64_t num_bits,
int64_t size_m, int64_t size_n,
int64_t size_k) {
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
return torch::empty({size_m, size_n}, options);
}
17 changes: 17 additions & 0 deletions csrc/quantization/gptq/q_gemm.cu
@@ -1854,3 +1854,20 @@ void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
: (int*)q_perm.data_ptr(),
q_weight.size(0) * 32 / bit, q_weight.size(1), bit);
}

torch::Tensor gptq_gemm_meta(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit) {
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
#if 0
// TODO: this might not be quite right, add check for symbolic dims and only
// use when needed?
auto const m = a.sym_size(0);
auto const n = b_q_weight.sym_size(1);
auto res = torch::empty_symint({m, n}, options);
#else
auto res = torch::empty({a.size(0), b_q_weight.size(1)}, options);
#endif
return res;
}
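The `#if 0` block above notes that symbolic sizes would need special handling in C++. Later commits in this PR ("try registering meta-function via python to handle symbolic shapes") move in the direction sketched below: an illustrative Python fake registration (only valid if no C++ meta is registered for the op), assuming `gptq_gemm` is bound under the `_C` extension namespace as in the CUDA bindings.

```python
# Illustrative only: a Python-side fake for gptq_gemm. With PyTorch 2.4,
# torch.library.register_fake lets SymInt sizes flow through without explicit
# sym_size()/empty_symint plumbing. Do not combine this with a C++ meta
# registration for the same op.
import torch
from torch.library import register_fake

@register_fake("_C::gptq_gemm")
def _gptq_gemm_fake(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
                    use_exllama, bit):
    # Same shape rule as the C++ meta above: [a.size(0), b_q_weight.size(1)],
    # in a's dtype and on a's device; the sizes may be symbolic under compile.
    return torch.empty((a.size(0), b_q_weight.size(1)),
                       dtype=a.dtype, device=a.device)
```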
12 changes: 12 additions & 0 deletions csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
}

#endif

torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
c10::SymInt size_k, c10::SymInt size_n,
int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
9 changes: 9 additions & 0 deletions csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -2297,3 +2297,12 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
}

#endif

torch::Tensor gptq_marlin_gemm_meta(
torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales,
torch::Tensor& b_zeros, torch::Tensor& g_idx, torch::Tensor& perm,
torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool has_zp) {
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
return torch::empty({size_m, size_n}, options);
}
12 changes: 12 additions & 0 deletions csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
}

#endif

torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
torch::Tensor& perm, c10::SymInt size_k,
c10::SymInt size_n, int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
114 changes: 114 additions & 0 deletions csrc/quantization/layernorm_kernels/activation_kernels.cu
@@ -0,0 +1,114 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>

#include "../../cuda_compat.h"
#include "../../dispatch_utils.h"
#include "../../reduction_utils.cuh"
// #include "quant_utils.cuh"
#ifndef USE_ROCM
using FP8_TYPE = c10::Float8_e4m3fn;
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX =
std::numeric_limits<FP8_TYPE>::max();
#else
#include "amd/hip_float8.h"
using FP8_TYPE = c10::Float8_e4m3fnuz;
// Using the default max value from pytorch (240.0) will cause accuracy
// issues when running dynamic quantization. Here use 224.0f for ROCm.
constexpr auto FP8_E4M3_MAX = 224.0f;
#endif
namespace vllm {

template <bool is_scale_inverted>
__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val,
float const scale) {
float x = 0.0f;
if constexpr (is_scale_inverted) {
x = val * scale;
} else {
x = val / scale;
}

float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
#ifndef USE_ROCM
return static_cast<c10::Float8_e4m3fn>(r);
#else
// Use hardware cvt instruction for fp8 on rocm
return c10::Float8_e4m3fnuz(hip_fp8(r).data,
c10::Float8_e4m3fnuz::from_bits());
#endif
}

static inline __device__ int8_t float_to_int8_rn(float x) {
uint32_t dst;
asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
return reinterpret_cast<const int8_t&>(dst);
}

template <typename T>
__device__ __forceinline__ T silu(const T& x) {
// x * sigmoid(x)
return (T)(((float)x) / (1.0f + expf((float)-x)));
}

template <typename scalar_t>
__global__ void silu_and_mul_quant_kernel(
FP8_TYPE* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2 * d]
const int d,
float* __restrict__ scale, // [num_tokens]
float* __restrict__ tmp) {
const int64_t token_idx = blockIdx.x;
// float amax_val = 0.0f;

for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
// const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
// const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
// scalar_t t = silu(x) * y;
// input[token_idx * 2 * d + idx] = t;
// amax_val = fmaxf(amax_val, fabsf((float) t));
const float x = (float)VLLM_LDG(&input[token_idx * 2 * d + idx]);
const float y = (float)VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
float t = silu(x) * y;
tmp[token_idx * d + idx] = t;
// amax_val = fmaxf(amax_val, fabsf(t));
}

// __shared__ float s_amax;
// amax_val = blockReduceMax(amax_val);
// if (threadIdx.x == 0) {
// s_amax = amax_val;
// // scale[blockIdx.x] = amax_val / 127.0f;
// }
// __syncthreads();

// float tmp_scale = 127.0f / s_amax;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
// out[token_idx * d + idx] =
// float_to_int8_rn(tmp_scale * (float) input[token_idx * 2 * d + idx]);
// out[token_idx * d + idx] =
// float_to_int8_rn(tmp_scale * tmp[token_idx * d + idx]);
out[token_idx * d + idx] = scaled_fp8_conversion<false>(
tmp[token_idx * d + idx], *scale);
}
}
} // namespace vllm

void silu_and_mul_quant(torch::Tensor& out, // [..., d]
torch::Tensor const& input, // [..., 2 * d]
torch::Tensor& scale, // [num_tokens]
torch::Tensor& tmp // [..., d]
) {
int d = input.size(-1) / 2;
int64_t num_tokens = input.numel() / input.size(-1);
dim3 grid(num_tokens);
dim3 block(std::min(d, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "silu_and_mul_quant_kernel", [&] {
vllm::silu_and_mul_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<FP8_TYPE>(), input.data_ptr<scalar_t>(), d,
scale.data_ptr<float>(), tmp.data_ptr<float>());
});
}
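For reference, a hypothetical Python-side call to this launcher, assuming it gets bound as `torch.ops._C.silu_and_mul_quant` (the CUDA torch_bindings change is not part of this excerpt). Shapes follow the signature comments: `input` is `[..., 2 * d]`, while `out` and `tmp` are `[..., d]`.

```python
# Hypothetical usage sketch; the binding name and dtypes are assumptions based
# on the launcher signature above (FP8_TYPE is c10::Float8_e4m3fn on CUDA).
import torch

num_tokens, d = 16, 4096
x = torch.randn(num_tokens, 2 * d, dtype=torch.float16, device="cuda")
out = torch.empty(num_tokens, d, dtype=torch.float8_e4m3fn, device="cuda")
tmp = torch.empty(num_tokens, d, dtype=torch.float32, device="cuda")
scale = torch.ones(num_tokens, dtype=torch.float32, device="cuda")
torch.ops._C.silu_and_mul_quant(out, x, scale, tmp)
```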