diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 090aa2ce04576..38f0885870df3 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1477,10 +1477,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1501,15 +1506,21 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     size_t x_offset = 0;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
             if (src0->backend != GGML_BACKEND_GPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                if (i12 % r2 == 0) {
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
             } else {
                 x_offset = (i03 * ne02 + i02) * x_ne;
             }
 
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
@@ -1530,7 +1541,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
@@ -1552,6 +1563,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1561,6 +1574,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1584,26 +1600,32 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     size_t x_offset = 0;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
             if (src0->backend != GGML_BACKEND_GPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                if (i12 % r2 == 0) {
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
             } else {
                 x_offset = (i03 * ne02 + i02) * x_ne;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
@@ -1640,7 +1662,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
@@ -1661,12 +1683,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1699,12 +1726,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i12 % r2 == 0) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->data;
             } else {
@@ -1713,7 +1746,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1735,7 +1768,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1759,7 +1792,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
diff --git a/ggml.c b/ggml.c
index 44c43b42409a9..5c3c8de3c30e0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -10624,11 +10624,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
        }
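
Below is a small standalone C sketch (not part of the patch) of the broadcast index mapping used in the loops above, assuming ne12 is a multiple of ne02 and ne13 a multiple of ne03 as ggml's mul_mat broadcasting requires; the batch sizes are made-up example values, and the "[copy src0 to device]" tag mirrors the patch's host-side `i12 % r2 == 0` upload condition.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical batch dims: src0 has ne02 x ne03 slices, src1 has ne12 x ne13
    const int64_t ne02 = 2, ne03 = 1;
    const int64_t ne12 = 6, ne13 = 2;

    // broadcast ratios, as computed in the patch
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            // each group of r2 consecutive src1 slices reuses src0 slice (i03, i02),
            // so the host->device copy of src0 only happens when i12 % r2 == 0
            printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)%s\n",
                   (long long) i13, (long long) i12,
                   (long long) i03, (long long) i02,
                   (i12 % r2 == 0) ? "  [copy src0 to device]" : "");
        }
    }
    return 0;
}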