Commit e14e255: merge in fix-graph-breaks
2 parents: 8dd4382 + 0168f9e

24 files changed: +459 -148 lines

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 10 additions & 0 deletions
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
 <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
 </ul>

+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a>.</li>
+<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See the documents above for a description of meta-functions.</li>
+<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
+<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
+</ul>
+
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>

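The new checklist items mention a schema, one or more backend implementations, and a meta-function per custom op. For orientation, here is a minimal, hedged C++ sketch of that registration pattern; the namespace demo_ops and the op scale are hypothetical, and vLLM's real bindings (e.g. csrc/cpu/torch_bindings.cpp below) differ in detail:

#include <torch/all.h>
#include <torch/library.h>

// Hypothetical op "demo_ops::scale"; a stand-in for a real kernel.
torch::Tensor scale_cpu(const torch::Tensor& x, double factor) {
  return x * factor;  // real computation
}

// Meta-function: same signature, but only shape/dtype propagation, no data.
torch::Tensor scale_meta(const torch::Tensor& x, double factor) {
  return torch::empty_like(x);
}

TORCH_LIBRARY(demo_ops, ops) {
  // The schema: argument types, mutability annotations, return type.
  ops.def("scale(Tensor x, float factor) -> Tensor");
}

TORCH_LIBRARY_IMPL(demo_ops, CPU, ops) {
  ops.impl("scale", &scale_cpu);  // backend implementation
}

TORCH_LIBRARY_IMPL(demo_ops, Meta, ops) {
  ops.impl("scale", &scale_meta);  // used by torch.compile / FakeTensor tracing
}

The template recommends implementing and registering meta-functions in Python via torch.library so dynamic dims are handled automatically; the C++ Meta-key route sketched above is the one this commit keeps for the marlin repack meta-functions further down.
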
csrc/cpu/torch_bindings.cpp

Lines changed: 4 additions & 4 deletions
@@ -27,8 +27,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // PagedAttention V2.
   ops.def(
       "paged_attention_v2("
-      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      " Tensor tmp_out, Tensor query, Tensor key_cache,"
+      " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      " Tensor! tmp_out, Tensor query, Tensor key_cache,"
       " Tensor value_cache, int num_kv_heads, float scale,"
       " Tensor block_tables, Tensor seq_lens, int block_size,"
       " int max_seq_len, Tensor? alibi_slopes,"
@@ -95,8 +95,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

   // Copy the cache blocks from src to dst.
   cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

   // Reshape the key and value tensors and cache them.

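Two schema fixes here: paged_attention_v2 now marks exp_sums, max_logits, and tmp_out as mutated (Tensor!), and copy_blocks moves the write annotation onto the tensor-list arguments with explicit alias sets. These annotations tell the dispatcher and torch.compile which arguments a custom op writes into, so an inaccurate schema can lead to incorrect results under tracing. A standalone illustration of the two annotation forms, with hypothetical op names rather than vLLM code:

#include <torch/library.h>

// Hypothetical library "demo_anno"; only the schema strings are the point.
TORCH_LIBRARY(demo_anno, ops) {
  // "Tensor!" marks a single tensor argument the kernel writes into.
  ops.def("scale_out(Tensor! out, Tensor src, float factor) -> ()");
  // For tensor lists the write annotation carries an alias set, e.g. "(a!)";
  // a second mutated list gets its own set, "(b!)".
  ops.def("copy_into(Tensor(a!)[] key_dsts, Tensor(b!)[] value_dsts, Tensor mapping) -> ()");
}
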
csrc/custom_all_reduce.cu

Lines changed: 11 additions & 0 deletions
@@ -146,6 +146,17 @@ std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
   return {handles, std::move(offsets)};
 }

+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta_meta(
+    fptr_t _fa) {
+  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handles =
+      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
+  return {handles, std::move(offsets)};
+}
+
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                             const std::vector<std::vector<int64_t>>& offsets) {
   auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);

csrc/ops.h

Lines changed: 8 additions & 32 deletions
@@ -70,28 +70,15 @@ torch::Tensor aqlm_dequant(
     const torch::Tensor& codes, const torch::Tensor& codebooks,
     const std::vector<int64_t>& codebook_partition_sizes);

-torch::Tensor aqlm_dequant_meta(const torch::Tensor& codes,
-                                const torch::Tensor& codebooks,
-                                const torch::Tensor& codebook_partition_sizes);
-
 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                        torch::Tensor _scaling_factors, torch::Tensor _zeros,
                        int64_t split_k_iters);

-torch::Tensor awq_gemm_meta(torch::Tensor _in_feats, torch::Tensor _kernel,
-                            torch::Tensor _scaling_factors,
-                            torch::Tensor _zeros, int64_t split_k_iters);
-
 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                              torch::Tensor _scaling_factors,
                              torch::Tensor _zeros, int64_t split_k_iters,
                              int64_t thx, int64_t thy);

-torch::Tensor awq_dequantize_meta(torch::Tensor _kernel,
-                                  torch::Tensor _scaling_factors,
-                                  torch::Tensor _zeros, int64_t split_k_iters,
-                                  int64_t thx, int64_t thy);
-
 torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                           torch::Tensor& b_scales, torch::Tensor& workspace,
                           int64_t size_m, int64_t size_n, int64_t size_k);
@@ -123,11 +110,6 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                   int64_t size_m, int64_t size_n,
                                   int64_t size_k);

-torch::Tensor gptq_marlin_24_gemm_meta(
-    torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_meta,
-    torch::Tensor& b_scales, torch::Tensor& workspace, int64_t num_bits,
-    int64_t size_m, int64_t size_n, int64_t size_k);
-
 torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                torch::Tensor& b_scales, torch::Tensor& b_zeros,
                                torch::Tensor& g_idx, torch::Tensor& perm,
@@ -137,23 +119,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                bool is_k_full, bool has_zp,
                                bool use_fp32_reduce);

-torch::Tensor gptq_marlin_gemm_meta(
-    torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales,
-    torch::Tensor& b_zeros, torch::Tensor& g_idx, torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n,
-    int64_t size_k, bool is_k_full, bool has_zp);
-
 torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);

 torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                      torch::Tensor& perm, int64_t size_k,
-                                      int64_t size_n, int64_t num_bits);
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits);

 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                 int64_t size_n, int64_t num_bits);

+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits);
+
 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                               int64_t n);
@@ -168,12 +148,6 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k);

-torch::Tensor fp8_marlin_gemm_meta(torch::Tensor& a, torch::Tensor& b_q_weight,
-                                   torch::Tensor& b_scales,
-                                   torch::Tensor& workspace, int64_t num_bits,
-                                   int64_t size_m, int64_t size_n,
-                                   int64_t size_k);
-
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);

 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
@@ -281,6 +255,8 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
                      const std::vector<int64_t>& offsets);
 std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
     fptr_t _fa);
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta_meta(
+    fptr_t _fa);
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                             const std::vector<std::vector<int64_t>>& offsets);
 #endif

csrc/quantization/gptq_marlin/awq_marlin_repack.cu

Lines changed: 4 additions & 3 deletions
@@ -268,13 +268,14 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,

 #endif

-torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight, int64_t size_k,
-                                     int64_t size_n, int64_t num_bits) {
+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits) {
   int const pack_factor = 32 / num_bits;
   auto options = torch::TensorOptions()
                      .dtype(b_q_weight.dtype())
                      .device(b_q_weight.device());
-  return torch::empty(
+  return torch::empty_symint(
       {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
       options);
 }

csrc/quantization/gptq_marlin/gptq_marlin_repack.cu

Lines changed: 5 additions & 5 deletions
@@ -344,13 +344,13 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 #endif

 torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
-                                      torch::Tensor& perm, int64_t size_k,
-                                      int64_t size_n, int64_t num_bits) {
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits) {
   int const pack_factor = 32 / num_bits;
   auto options = torch::TensorOptions()
                      .dtype(b_q_weight.dtype())
                      .device(b_q_weight.device());
-  return torch::empty({size_k / marlin::tile_size,
-                       size_n * marlin::tile_size / pack_factor},
-                      options);
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
 }

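Both repack meta-functions now take c10::SymInt sizes and allocate with torch::empty_symint instead of torch::empty. Under torch.compile with dynamic shapes, the sizes reaching a meta-function may be symbolic, so the output-shape arithmetic has to stay symbolic rather than forcing concrete integers. A self-contained sketch of the same pattern, with the hypothetical name demo_repack_meta and illustrative constants standing in for marlin::tile_size and the pack factor:

#include <torch/all.h>

// Hypothetical stand-in for the repack meta-functions above; "tile" and
// "pack" replace marlin::tile_size and pack_factor. The shape arithmetic on
// c10::SymInt stays symbolic, and torch::empty_symint accepts symbolic sizes
// where torch::empty would require concrete int64_t dimensions.
torch::Tensor demo_repack_meta(torch::Tensor& b_q_weight, c10::SymInt size_k,
                               c10::SymInt size_n, int64_t num_bits) {
  int64_t const tile = 16;             // illustrative tile size
  int64_t const pack = 32 / num_bits;  // values packed per 32-bit word
  auto options = torch::TensorOptions()
                     .dtype(b_q_weight.dtype())
                     .device(b_q_weight.device());
  // SymInt / int64_t and SymInt * int64_t keep the result symbolic.
  return torch::empty_symint({size_k / tile, size_n * tile / pack}, options);
}

Whether this accounts for every graph break the branch name refers to is not visible from this excerpt, but the SymInt signatures are what let these meta-functions run under dynamic-shape compilation without specializing on concrete sizes.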