Skip to content

Commit

Permalink
CUDA: fix MMQ stream-k rounding if ne00 % 128 != 0 ggerganov#8311
Browse files Browse the repository at this point in the history
Co-Authored-By: Johannes Gäßler <johannesg@5d6.de>
  • Loading branch information
Nexesenex and JohannesGaessler committed Jul 4, 2024
1 parent 85ee1bc commit 3a9282f
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions ggml-cuda/mmq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2446,8 +2446,11 @@ static __global__ void mul_mat_q(
const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y

// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc = GGML_PAD((int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
const int64_t kbc_stop = GGML_PAD((int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x;

kbc -= (kbc % blocks_per_ne00) % blocks_per_warp;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_warp;

// kb0 == k index when doing the matrix multiplication for an output tile.
int kb0_start = kbc % blocks_per_ne00;
Expand Down Expand Up @@ -2503,8 +2506,11 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int bidx_stop = (blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq / (gridDim.y*gridDim.x) + 1;

for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) {
const int64_t kbc = GGML_PAD((int64_t) bidx *blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
const int64_t kbc_stop = GGML_PAD((int64_t)(bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
int64_t kbc = (int64_t) bidx *blocks_per_ne00*ntx*nty / block_num_mmq;
int64_t kbc_stop = (int64_t)(bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq;

kbc -= (kbc % blocks_per_ne00) % blocks_per_warp;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_warp;

// Skip fixup tile if the MMQ CUDA block never wrote anything to it:
if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) {
Expand Down

0 comments on commit 3a9282f

Please sign in to comment.