Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ggml/src/ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)

// backend registry

#define GGML_REG_MAX_BACKENDS 16
#define GGML_REG_MAX_BACKENDS 64

struct ggml_backend_reg {
char name[128];
Expand Down
49 changes: 13 additions & 36 deletions ggml/src/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3768,37 +3768,13 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));

const ggml_tensor_extra_gpu *src0_extra =
(const ggml_tensor_extra_gpu *)src0->extra;
const ggml_tensor_extra_gpu *src1_extra =
(const ggml_tensor_extra_gpu *)src1->extra;
const ggml_tensor_extra_gpu *dst_extra =
(const ggml_tensor_extra_gpu *)dst->extra;

ggml_tensor_extra_gpu src0_row_extra;
ggml_tensor_extra_gpu src1_row_extra;
ggml_tensor_extra_gpu dst_row_extra;

ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;

src1_row.backend = GGML_BACKEND_TYPE_GPU;
dst_row.backend = GGML_BACKEND_TYPE_GPU;

src0_row.extra = &src0_row_extra;
src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra;

char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
? (char *)src0->data
: (char *)src0_extra->data_device[ctx.device];
char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
? (char *)src1->data
: (char *)src1_extra->data_device[ctx.device];
char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
? (char *)dst->data
: (char *)dst_extra->data_device[ctx.device];
char *src0_original = (char *)src0->data;
char *src1_original = (char *)src1->data;
char *dst_original = (char *)dst->data;

src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
Expand Down Expand Up @@ -3827,12 +3803,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
const int64_t i1 = id;
const int64_t i2 = i12;

src0_row_extra.data_device[ctx.device] =
src0_original + i02*nb02;
src1_row_extra.data_device[ctx.device] =
src1_original + + i11*nb11 + i12*nb12;
dst_row_extra.data_device[ctx.device] =
dst_original + i1*nb1 + i2*nb2;
src0_row.data = src0_original + i02*nb02;
src1_row.data = src1_original + + i11*nb11 + i12*nb12;
dst_row.data = dst_original + i1*nb1 + i2*nb2;

ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
}
Expand All @@ -3841,8 +3814,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

src1_row_extra.data_device[ctx.device] = src1_contiguous.get();
dst_row_extra.data_device[ctx.device] = dst_contiguous.get();
src1_row.data = src1_contiguous.get();
dst_row.data = dst_contiguous.get();

for (int64_t i02 = 0; i02 < n_as; i02++) {
int64_t num_src1_rows = 0;
Expand Down Expand Up @@ -3898,7 +3871,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
});
}

src0_row_extra.data_device[ctx.device] = src0_original + i02*nb02;
src0_row.data = src0_original + i02*nb02;

GGML_ASSERT(nb11 == sizeof(float)*ne10);
GGML_ASSERT(nb1 == sizeof(float)*ne0);
Expand Down Expand Up @@ -5221,6 +5194,10 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
return false;
}
}
ggml_type src0_type = op->src[0]->type;
if (src0_type == GGML_TYPE_BF16) {
return false;
}
return true;
} break;
case GGML_OP_GET_ROWS:
Expand Down
7 changes: 0 additions & 7 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5883,13 +5883,6 @@ static bool llm_load_tensors(

auto & hparams = model.hparams;

#ifdef GGML_USE_SYCL
// disable MoE with SYCL until mul_mat_id is updated
if (hparams.n_expert > 0) {
n_gpu_layers = 0;
}
#endif

model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
Expand Down