From 69166a8d1f64f805681a928ee36857f85bc0383b Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Wed, 2 Apr 2025 18:49:20 +0800
Subject: [PATCH] fix use_atomic_add support of marlin kernel when using v1
 engine

Signed-off-by: Jinzhen Lin
---
 csrc/quantization/gptq_marlin/gptq_marlin.cu              | 6 +++++-
 .../layers/quantization/utils/marlin_utils.py             | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 14d397d03e13..83bbd1e6816a 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -1785,7 +1785,7 @@ __global__ void Marlin(
           <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(             \
               A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,  \
               num_groups, prob_m, prob_n, prob_k, lda, locks,            \
-              use_atomic_add, use_fp32_reduce);                          \
+              part_use_atomic_add, use_fp32_reduce);                     \
     }                                                                    \
   }
 
@@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }
 
+  // atomic add reduce has better performance only when m * n is small
+  bool part_use_atomic_add =
+      use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
+
   if (false) {
   }
   GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index d1fb52ae09de..5b2e3ca2c799 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -305,7 +305,7 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
 
     # the performance of atomicAdd is better than global reduce
     # only when m*n is small and k is large
-    return max(m, 64) * n < 64 * 2048 and k >= 2048
+    return n < 2048 and k >= 2048
 
 
 def apply_gptq_marlin_linear(
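
For illustration: the new C++ bound div_ceil(prob_m, 64) * prob_n <= 2048 is
essentially the old Python bound max(m, 64) * n < 64 * 2048 re-applied at
launch time, where the actual prob_m is known, while the Python side keeps
only the m-independent checks, presumably because under the v1 engine the m
seen there need not match the runtime m. Below is a minimal Python sketch of
how the two gates compose after this patch; div_ceil mirrors the C++ helper,
and host_gate, kernel_gate, and the sample sizes are hypothetical names for
illustration, not vLLM API:

    def div_ceil(a: int, b: int) -> int:
        # ceiling division, mirroring the C++ div_ceil helper
        return -(a // -b)

    def host_gate(n: int, k: int) -> bool:
        # marlin_utils.py after the patch: only n and k are checked here,
        # since m is not reliable at this point under the v1 engine
        return n < 2048 and k >= 2048

    def kernel_gate(use_atomic_add: bool, prob_m: int, prob_n: int) -> bool:
        # gptq_marlin.cu after the patch: the actual prob_m is known at
        # launch time, so the m * n bound is re-applied per call
        return use_atomic_add and div_ceil(prob_m, 64) * prob_n <= 2048

    # n = 1024, k = 4096 passes the host gate, but a large prob_m still
    # disables atomic add for that particular launch:
    flag = host_gate(n=1024, k=4096)                   # True
    print(kernel_gate(flag, prob_m=64, prob_n=1024))   # 1 * 1024 <= 2048 -> True
    print(kernel_gate(flag, prob_m=512, prob_n=1024))  # 8 * 1024 > 2048  -> False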