Multi-thread ggml_cpy() #824

Closed · wants to merge 3 commits
ggml.c: 58 changes (41 additions & 17 deletions)
@@ -4936,16 +4936,26 @@ static void ggml_compute_forward_dup_f16(
int64_t i12 = 0;
int64_t i13 = 0;

const int thread_num = params->ith;
const int total_threads = params->nth;

int region_index = 0;

if (dst->type == GGML_TYPE_F16) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
// Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
if ((region_index++ % total_threads) == thread_num) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
}

// Regardless, we have to keep the dst counters updated
if (++i10 == ne00) {
i10 = 0;
if (++i11 == ne01) {
@@ -4967,11 +4977,16 @@ static void ggml_compute_forward_dup_f16(
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

*(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
// Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
if ((region_index++ % total_threads) == thread_num) {
@Fabio3rs (Contributor) commented on Apr 10, 2023:

Is it possible to change how the loop counts so that the same result is achieved without this branch?

Having the threads read and write contiguous memory regions can, in theory, give better performance because of how branch prediction and the memory loading mechanisms (cache line size, for example) work.
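
A minimal sketch of that suggestion, assuming src0 and dst share the same logical shape (so the source indices can address dst directly and the separate i10..i13 counters are not needed): split the i01 rows into one contiguous block per thread instead of interleaving single elements. The block below is a hypothetical illustration, not code from this PR; it reuses the names already in scope in ggml_compute_forward_dup_f32.

    // Hypothetical replacement for the interleaved per-element check: give each
    // thread one contiguous block of i01 rows. Assumes the variables from
    // ggml_compute_forward_dup_f32 (params, src0, dst, ne00..ne03, nb00..nb03,
    // nb0..nb3) are in scope and that src0 and dst have the same dimensions.
    const int ith = params->ith;                 // this thread's index
    const int nth = params->nth;                 // total number of threads

    const int64_t dr  = (ne01 + nth - 1)/nth;    // rows per thread, rounded up
    const int64_t ir0 = dr*ith;                  // first row owned by this thread
    const int64_t ir1 = ir0 + dr < ne01 ? ir0 + dr : ne01; // one past the last row

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ir0; i01 < ir1; i01++) {   // contiguous slice, no branch
                for (int64_t i00 = 0; i00 < ne00; i00++) {
                    const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
                    char * dst_ptr        = (char *) dst->data  + i00*nb0  + i01*nb1  + i02*nb2  + i03*nb3;

                    memcpy(dst_ptr, src0_ptr, sizeof(float));
                }
            }
        }
    }

Each thread then walks one contiguous range of rows, so the per-element modulo test disappears and the elements a thread writes stay within the same cache lines.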

const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

*(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
}

// Regardless, we have to keep the dst counters updated
if (++i10 == ne00) {
i10 = 0;
if (++i11 == ne01) {
@@ -4997,7 +5012,6 @@ static void ggml_compute_forward_dup_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -5030,16 +5044,26 @@ static void ggml_compute_forward_dup_f32(
int64_t i12 = 0;
int64_t i13 = 0;

const int thread_num = params->ith;
const int total_threads = params->nth;

int region_index = 0;

if (dst->type == GGML_TYPE_F32) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

memcpy(dst_ptr, src0_ptr, sizeof(float));
// Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
if ((region_index++ % total_threads) == thread_num) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

memcpy(dst_ptr, src0_ptr, sizeof(float));
}

// Regardless, we have to keep the dst counters updated
if (++i10 == dst->ne[0]) {
i10 = 0;
if (++i11 == dst->ne[1]) {
@@ -5061,11 +5085,14 @@ static void ggml_compute_forward_dup_f32(
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
if ((region_index++ % total_threads) == thread_num) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
}

// Regardless, we have to keep the dst counters updated
if (++i10 == dst->ne[0]) {
i10 = 0;
if (++i11 == dst->ne[1]) {
@@ -9441,7 +9468,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = n_threads;
} break;
case GGML_OP_CPY:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
Expand All @@ -9451,10 +9477,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = 1;
} break;
case GGML_OP_CPY:
case GGML_OP_SOFT_MAX:
{
node->n_tasks = n_threads;
} break;
case GGML_OP_ROPE:
{
node->n_tasks = n_threads;