diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 7e4069d76b2595..0108759632c5f4 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o const enum ggml_type type = src->type; const size_t ts = ggml_type_size(type); const size_t bs = ggml_blck_size(type); + const uint64_t row_size = ts*ne0/bs; - const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); - if (nb0 == ts && nb1 == ts*ne0/bs) { - err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev); - return err; + const char * x = (const char *) src->data + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == row_size) { + return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); } if (nb0 == ts) { const size_t buffer_origin[3] = { offset, 0, 0 }; const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts*ne0/bs, ne1, 1 }; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev); - return err; + const size_t region[3] = { row_size, ne1, 1 }; + return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); } + std::vector events; + if (ev && ne1>1) events.reserve(ne1-1); for (uint64_t i1 = 0; i1 < ne1; i1++) { // pretend the row is a matrix with cols=1 - const size_t buffer_origin[3] = { offset, i1, 0 }; + const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts/bs, ne0, 1 }; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev); + const size_t region[3] = { ts, ne0/bs, 1 }; + // if an event is requested, make the last write wait for all previous writes to complete + if (ev && i1) { + events.push_back(*ev); + } + cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev); if (err != CL_SUCCESS) { - break; + for (auto event : events) { + clReleaseEvent(event); + } + return err; } } - return err; + for (auto event : events) { + CL_CHECK(clReleaseEvent(event)); + } + return CL_SUCCESS; } static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1888,17 +1900,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { const int64_t ne3 = tensor->ne[3]; const ggml_type type = tensor->type; - const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type)); + const size_t q_sz = s_sz * (size_t) (ne2 * ne3); size_t q_size; cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); tensor->data = data; // copy tensor to device + size_t offset = 0; for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { - int i = i3*ne2 + i2; - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL)); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); + offset += s_sz; } }