ggml-cpu : add chunking support to mul_mat_id #11666

Merged (8 commits) on Feb 13, 2025
ggml/src/ggml-cpu/ggml-cpu.c: 269 changes (184 additions, 85 deletions)
@@ -7,10 +7,8 @@
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
#include "amx/amx.h"
#include "ggml.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
@@ -1291,7 +1289,7 @@ struct ggml_threadpool {
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
atomic_int GGML_CACHE_ALIGN n_barrier;
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

// these are atomic as an annotation for thread-sanitizer
atomic_bool stop; // Used for stopping the threadpool altogether
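
The change above moves `current_chunk` onto its own cache line (`GGML_CACHE_ALIGN`) so that updates to the barrier counters no longer invalidate the line holding the chunk counter while threads are claiming work. A minimal standalone sketch of the false-sharing problem this avoids (the struct name and the 64-byte line size are illustrative assumptions, not ggml code):

#include <stdatomic.h>

// Two hot atomics on one cache line: every update of `n_barrier` forces cores
// polling `current_chunk` to re-read the whole line, and vice versa.
// Giving each counter its own line removes that coupling.
struct hot_counters {
    _Alignas(64) atomic_int n_barrier;     // bumped on every barrier pass
    _Alignas(64) atomic_int current_chunk; // bumped on every chunk claim
};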
@@ -7490,13 +7488,15 @@ UseGgmlGemm1:;
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;

const size_t nbw0 = ggml_type_size(vec_dot_type);
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->wsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

#if 0
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
Expand All @@ -7506,6 +7506,20 @@ UseGgmlGemm1:;
}
}
}
#else
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
size_t bs = ggml_blck_size(vec_dot_type);
int64_t ne10_block_start = (ith * ne10/bs) / nth;
int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
(ne10_block_end - ne10_block_start) * bs);
}
}
}
#endif
}
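
The replacement loop changes the parallelization axis of the src1-to-`vec_dot_type` conversion: instead of giving each thread whole rows (`i11 += nth`), every thread converts a contiguous range of quant blocks within every row, which keeps all threads busy even when `ne11` is small (e.g. a single token). A hedged standalone check of the partitioning arithmetic, with illustrative values (`ne10` must be a multiple of the block size, which ggml asserts elsewhere):

#include <assert.h>
#include <stdint.h>

int main(void) {
    const int64_t ne10 = 4096; // row length in floats (illustrative)
    const int64_t bs   = 32;   // block size of vec_dot_type (illustrative)
    const int64_t nth  = 6;    // thread count (illustrative)

    int64_t covered = 0;
    for (int64_t ith = 0; ith < nth; ++ith) {
        // same arithmetic (and operator precedence) as the loop above
        const int64_t block_start = (ith       * ne10/bs) / nth;
        const int64_t block_end   = ((ith + 1) * ne10/bs) / nth;
        assert(block_start == covered); // ranges are contiguous, no gaps
        covered = block_end;
    }
    assert(covered == ne10/bs); // every block is converted exactly once
    return 0;
}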

if (ith == 0) {
@@ -7593,7 +7607,6 @@ UseGgmlGemm2:;
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
num_rows_per_vec_dot = 1;
}

ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

if (nth >= nchunk0 * nchunk1) {
@@ -7606,6 +7619,84 @@

// ggml_compute_forward_mul_mat_id

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]

struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
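
For orientation: `matrix_rows` groups the src1 rows by the expert they were routed to, so `MMID_MATRIX_ROW(e, k)` is the k-th (slot, token) pair assigned to expert `e`. A hedged illustration, assuming the usual routing build loop that fills the table from `ids` (that loop is elided in this diff):

// 2 experts, 2 tokens, 2 experts used per token (illustrative):
// token 0 routes to experts {0, 1}, token 1 routes to {1, 0}.
// After the build loop: matrix_row_counts = {2, 2} and, for expert 0:
//   MMID_MATRIX_ROW(0, 0) == (struct mmid_row_mapping) { .i1 = 0, .i2 = 0 }
//   MMID_MATRIX_ROW(0, 1) == (struct mmid_row_mapping) { .i1 = 1, .i2 = 1 }
// i.e. expert 0 sees token 0 via slot 0 and token 1 via slot 1.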

static void ggml_compute_forward_mul_mat_id_one_chunk(
struct ggml_tensor * dst,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * ids,
const int64_t cur_a,
const int64_t ir0_start,
const int64_t ir0_end,
const int64_t ir1_start,
const int64_t ir1_end,
const char * src0_cur,
const struct mmid_row_mapping * matrix_rows,
const size_t row_size,
const bool src1_cont,
const void * wdata) {

GGML_TENSOR_BINARY_OP_LOCALS

const enum ggml_type type = src0->type;

ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;

const int64_t blck_0 = 16;
const int64_t blck_1 = 16;

float tmp[16];

for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
const int64_t _i12 = ir1; // logical row index for this expert

struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
const int id = row_mapping.i1; // selected expert index

const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1

const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row

// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12));

float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
}

memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
}
}
}
}

static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {

void * ptr = *p;
ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
*p = (void *) ((char *) ptr + size);
return ptr;
}
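
`incr_ptr_aligned` is a small bump allocator over `params->wdata`: it rounds the cursor up to `align`, returns the aligned pointer, and advances the cursor past `size` bytes. A hedged usage sketch (buffer and sizes are illustrative):

#include <stdint.h>

static void wdata_layout_demo(void) {
    char buf[256];
    void * cursor = buf;
    // carve two regions out of one buffer, mirroring the wdata layout used below
    int64_t * counts = incr_ptr_aligned(&cursor, 4*sizeof(int64_t), sizeof(int64_t));
    char    * lines  = incr_ptr_aligned(&cursor, 2*64, 64); // cache-line aligned region
    // ((char *) cursor - buf) is the total consumed, including alignment padding
    (void) counts; (void) lines;
}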

static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
Expand All @@ -7623,7 +7714,6 @@ static void ggml_compute_forward_mul_mat_id(

const bool src1_cont = ggml_is_contiguous(src1);

ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;

@@ -7641,41 +7731,60 @@ static void ggml_compute_forward_mul_mat_id(
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_expert

char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
void * wdata_cur = params->wdata;

struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
if (src1->type != vec_dot_type) {
incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
}

int64_t * matrix_row_counts = // [n_as]
incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));

struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));

int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);

GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));

if (src1->type != vec_dot_type) {
char * wdata = params->wdata;

const size_t nbw0 = ggml_type_size(vec_dot_type);
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;

assert(params->wsize >= ne13*nbw3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

#if 0
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
}
}
}
#else
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
size_t bs = ggml_blck_size(vec_dot_type);
int64_t ne10_block_start = (ith * ne10/bs) / nth;
int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
(ne10_block_end - ne10_block_start) * bs);
}
}
}
#endif
}

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

if (ith == 0) {
// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7693,94 +7802,79 @@ static void ggml_compute_forward_mul_mat_id(
}
}

// reset current_chunk
for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
*current_chunk_ctr = nth;
}

ggml_barrier(params->threadpool);

// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];

if (cne1 == 0) {
continue;
}

const char * src0_cur = (const char *) src0->data + cur_a*nb02;

const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const char * src0_cur = (const char *) src0->data + cur_a * nb02;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows

// distribute the thread work across the inner or outer loop based on which one is larger

const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;

const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
const int64_t nr0 = ne01;
const int64_t nr1 = cne1;

const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);

// threads with no work simply yield (not sure if it helps)
//if (ir010 >= ir011 || ir110 >= ir111) {
// sched_yield();
// continue;
//}
int chunk_size = 16;
if (nr0 == 1 || nr1 == 1) {
chunk_size = 64;
}

// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
#if defined(__aarch64__)
// disable for ARM
const bool disable_chunking = true;
#else
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
#endif // defined(__aarch64__)

// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t _i12 = ir1; // logical row index for this expert
if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
nchunk0 = nr0 > nr1 ? nth : 1;
nchunk1 = nr0 > nr1 ? 1 : nth;
}
Comment on lines +7832 to +7846 (Member, Author):
The same change could be applied to the regular mul_mat if it causes a regression on some ARM devices.
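
To make the heuristic above concrete: dynamic chunking is only worthwhile when there are enough chunks to amortize the atomic counter, so the code requires at least 4 chunks per thread and otherwise falls back to the old static one-slice-per-thread split (always on ARM, and on NUMA systems). A standalone restatement of that decision under the same constants (a sketch, not ggml code):

#include <stdbool.h>
#include <stdint.h>

static void plan_chunks(int64_t nr0, int64_t nr1, int64_t nth, bool disable_chunking,
                        int64_t * nchunk0, int64_t * nchunk1) {
    // smaller chunks when both dimensions are non-trivial, bigger when one is 1
    const int64_t chunk_size = (nr0 == 1 || nr1 == 1) ? 64 : 16;
    *nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
    *nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
    if (*nchunk0 * *nchunk1 < nth * 4 || disable_chunking) {
        // too few chunks to amortize the atomic counter:
        // fall back to one static slice per thread along the larger dimension
        *nchunk0 = nr0 > nr1 ? nth : 1;
        *nchunk1 = nr0 > nr1 ? 1 : nth;
    }
}

// e.g. nr0 = 4096, nr1 = 8, nth = 8: 256 x 1 chunks, dynamic chunking stays on;
//      nr0 = 32,   nr1 = 1, nth = 8: 1 x 1 chunk, falls back to 8 static slices.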


struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
const int id = row_mapping.i1; // selected expert index
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
int current_chunk = ith;

const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);

// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12));
while (current_chunk < nchunk0 * nchunk1) {
const int64_t ith0 = current_chunk % nchunk0;
const int64_t ith1 = current_chunk / nchunk0;

float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
const int64_t ir0_start = dr0 * ith0;
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
const int64_t ir1_start = dr1 * ith1;
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
}
ggml_compute_forward_mul_mat_id_one_chunk(
dst, src0, src1, ids, cur_a,
ir0_start, ir0_end, ir1_start, ir1_end,
src0_cur, matrix_rows, row_size, src1_cont, wdata
);

memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
if (nth >= nchunk0 * nchunk1) {
break;
}

current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
}
}

#undef MMID_MATRIX_ROW
}
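
The per-expert loop above follows a simple work-stealing pattern: each thread starts on chunk `ith`, and once the first `nth` chunks are implicitly handed out by thread index, further chunks are claimed with a relaxed `atomic_fetch_add` on that expert's cache-line-padded counter, which was pre-initialized to `nth` before the barrier. A minimal standalone sketch of the pattern (generic worker, not ggml code):

#include <stdatomic.h>

static void worker(atomic_int * ctr, int ith, int nth, int nchunks) {
    int chunk = ith; // the first nth chunks are assigned by thread index
    while (chunk < nchunks) {
        // process_chunk(chunk); // hypothetical per-chunk work
        if (nth >= nchunks) {
            break; // more threads than chunks: nothing left to steal
        }
        chunk = atomic_fetch_add_explicit(ctr, 1, memory_order_relaxed);
    }
}
// before spawning: atomic_init(ctr, nth), so the first stolen chunk is index nth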

// ggml_compute_forward_out_prod
@@ -13717,14 +13811,19 @@ struct ggml_cplan ggml_graph_plan(
cur = 0;
const struct ggml_tensor * src0 = node->src[0];
const struct ggml_tensor * src1 = node->src[1];
const struct ggml_tensor * ids = node->src[2];
const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
const int n_as = src0->ne[2];
// src1
if (src1->type != vec_dot_type) {
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
}
const int n_as = src0->ne[2];
cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
// matrix_row_counts
cur += n_as * sizeof(int64_t) + sizeof(int64_t);
// matrix_rows
cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
// atomic_current_chunk
cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
} break;
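
The new work-size computation reserves each region plus its worst-case alignment slack, matching the `incr_ptr_aligned` carving in the kernel. A hedged restatement as a standalone helper (names are illustrative; `n_ids` stands for `ids->ne[0]*ids->ne[1]`, and `src1_converted_bytes` is 0 when src1 is already in `vec_dot_type`):

#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 64 // assumption for this sketch

struct mmid_row_mapping { int32_t i1, i2; };

static size_t mmid_wsize(size_t src1_converted_bytes, int64_t n_as, int64_t n_ids) {
    size_t cur = 0;
    cur += src1_converted_bytes + sizeof(int64_t);                       // converted src1
    cur += n_as*sizeof(int64_t) + sizeof(int64_t);                       // matrix_row_counts
    cur += n_as*n_ids*sizeof(struct mmid_row_mapping) + sizeof(int64_t); // matrix_rows
    cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;                       // per-expert chunk counters
    return cur; // each trailing "+ constant" is worst-case padding for one aligned region
}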
case GGML_OP_OUT_PROD:
{