diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index b0f31c45e731..dbbb97e6fb3a 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -375,25 +375,25 @@ torch::Tensor ggml_moe_a8(torch::Tensor X, // input int64_t ggml_moe_get_block_size(int64_t type) { switch (type) { case 2: - return MMQ_X_Q4_0; + return MOE_X_Q4_0; case 3: - return MMQ_X_Q4_1; + return MOE_X_Q4_1; case 6: - return MMQ_X_Q5_0; + return MOE_X_Q5_0; case 7: - return MMQ_X_Q5_1; + return MOE_X_Q5_1; case 8: - return MMQ_X_Q8_0; + return MOE_X_Q8_0; case 10: - return MMQ_X_Q2_K; + return MOE_X_Q2_K; case 11: - return MMQ_X_Q3_K; + return MOE_X_Q3_K; case 12: - return MMQ_X_Q4_K; + return MOE_X_Q4_K; case 13: - return MMQ_X_Q5_K; + return MOE_X_Q5_K; case 14: - return MMQ_X_Q6_K; + return MOE_X_Q6_K; } return 0; } diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh index 2dbafc0f7422..c10c59d7a38a 100644 --- a/csrc/quantization/gguf/moe.cuh +++ b/csrc/quantization/gguf/moe.cuh @@ -129,12 +129,12 @@ static __device__ __forceinline__ void moe_q( } #if defined(USE_ROCM) - #define MMQ_X_Q4_0 64 - #define MMQ_Y_Q4_0 128 + #define MOE_X_Q4_0 64 + #define MOE_Y_Q4_0 128 #define NWARPS_Q4_0 8 #else - #define MMQ_X_Q4_0 4 - #define MMQ_Y_Q4_0 32 + #define MOE_X_Q4_0 4 + #define MOE_Y_Q4_0 32 #define NWARPS_Q4_0 4 #endif @@ -149,8 +149,8 @@ __launch_bounds__(WARP_SIZE_GGUF* NWARPS_Q4_0, 2) const int exp_stride, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, const int top_k) { - const int mmq_x = MMQ_X_Q4_0; - const int mmq_y = MMQ_Y_Q4_0; + const int mmq_x = MOE_X_Q4_0; + const int mmq_y = MOE_Y_Q4_0; const int nwarps = NWARPS_Q4_0; moe_q