Skip to content

Commit

Permalink
iqk_mul_mat: better strategy when nrc_y not divisible by ny (#71)
Browse files Browse the repository at this point in the history
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
  • Loading branch information
ikawrakow and Kawrakow authored Oct 1, 2024
1 parent fd20638 commit 8cba478
Showing 1 changed file with 31 additions and 8 deletions.
39 changes: 31 additions & 8 deletions ggml/src/iqk/iqk_mul_mat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,39 @@ struct MulMat {
while (!funcs[ny-1] && ny > 0) --ny;
int n_step = (nrc_y - info.cur_y)/ny;
if (n_step > 0) {
for (int ix = 0; ix < nrc_x; ix += k_x_step) {
auto this_info = info;
this_info.s += ix;
int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
for (int iy = 0; iy < n_step; ++iy) {
funcs[ny-1](n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
this_info.cur_y += ny;
if (n_step*ny != nrc_y) {
++n_step;
int ny1 = nrc_y/n_step;
int ny2 = ny1 + 1;
int my1 = n_step*ny2 - nrc_y;
int my2 = n_step - my1;
for (int ix = 0; ix < nrc_x; ix += k_x_step) {
auto this_info = info;
this_info.s += ix;
int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
for (int iy = 0; iy < my1; ++iy) {
funcs[ny1-1](n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
this_info.cur_y += ny1;
}
for (int iy = 0; iy < my2; ++iy) {
funcs[ny2-1](n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
this_info.cur_y += ny2;
}
}
info.cur_y += nrc_y;
}
else {
for (int ix = 0; ix < nrc_x; ix += k_x_step) {
auto this_info = info;
this_info.s += ix;
int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
for (int iy = 0; iy < n_step; ++iy) {
funcs[ny-1](n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
this_info.cur_y += ny;
}
}
info.cur_y += ny * n_step;
}
info.cur_y += ny * n_step;
}
int n_left = nrc_y - info.cur_y;
if (n_left > 0) {
Expand Down

0 comments on commit 8cba478

Please sign in to comment.