diff --git a/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt b/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt
index 294b0eba2cd517..2f9f22f921a88e 100644
--- a/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt
+++ b/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt
@@ -3,38 +3,38 @@
 -1 0 0 3 0 1 -1 0 0
 0 1 0 3 0 1 -1 0 0
 4096 4096
- 16 23 14046 0 0 0 0 11366 6297 0
- 32 36 26793 0 0 0 0 11244 6201 0
- 48 55 40187 0 0 0 0 11316 7811 0
- 64 78 54450 0 0 0 0 11149 7859 0
- 80 96 68095 0 0 0 0 11258 8748 0
- 96 114 81588 0 0 0 0 11017 10248 0
-112 134 96596 0 0 0 0 11186 10506 0
-128 157 112871 0 0 0 0 11179 11887 0
+ 16 17 14400 0 0 20380 0 13643 6406 0
+ 32 48 26184 0 0 17892 0 12759 6875 0
+ 48 62 40950 0 0 20940 0 11344 6470 0
+ 64 75 54959 0 0 19897 0 12056 8272 0
+ 80 95 69812 0 0 23261 0 13296 10944 0
+ 96 135 82530 0 0 20238 0 11363 9733 0
+112 135 97063 0 0 21620 0 11008 10231 0
+128 160 110596 0 0 22374 0 11130 12202 0
 4096 11008
- 16 55 36520 0 0 0 0 29851 9467 0
- 32 103 73460 0 0 0 0 29815 11175 0
- 48 173 109619 0 0 0 0 29870 13368 0
- 64 206 147174 0 0 0 0 29571 16828 0
- 80 289 178721 0 0 0 0 29895 18013 0
- 96 343 219130 0 0 0 0 29633 21457 0
-112 550 257754 0 0 0 0 30342 23557 0
-128 594 298395 0 0 0 0 29683 24796 0
+ 16 63 34214 0 0 43145 0 30377 9875 0
+ 32 98 71625 0 0 43591 0 29675 11653 0
+ 48 155 109818 0 0 44130 0 30964 14123 0
+ 64 253 144841 0 0 46174 0 29843 17059 0
+ 80 279 175670 0 0 47225 0 29574 16913 0
+ 96 331 217921 0 0 48978 0 29582 19354 0
+112 408 254362 0 0 53326 0 29963 22962 0
+128 611 281834 0 0 57593 0 30629 25448 0
 11008 4096
- 16 19 35077 0 0 0 0 30130 21051 0
- 32 43 71844 0 0 0 0 29937 21740 0
- 48 56 108664 0 0 0 0 30534 23017 0
- 64 94 148288 0 0 0 0 29848 26486 0
- 80 108 187098 0 0 0 0 29896 29687 0
- 96 116 224466 0 0 0 0 29931 31416 0
-112 137 264372 0 0 0 0 29797 34035 0
-128 178 300958 0 0 0 0 29713 37036 0
+ 16 18 35422 0 0 53263 0 30608 20630 0
+ 32 37 69747 0 0 54542 0 30501 23162 0
+ 48 53 107693 0 0 56207 0 29500 23522 0
+ 64 99 144891 0 0 60231 0 29461 23695 0
+ 80 98 178384 0 0 60697 0 29281 26783 0
+ 96 112 217583 0 0 63507 0 29741 31710 0
+112 170 253402 0 0 65329 0 28823 34861 0
+128 189 290395 0 0 70656 0 31023 35913 0
 32000 4096
- 16 18 105077 0 0 0 0 87731 67479 0
- 32 36 205088 0 0 0 0 86620 72865 0
- 48 54 314438 0 0 0 0 87458 77700 0
- 64 75 420397 0 0 0 0 86515 83575 0
- 80 109 541305 0 0 0 0 86580 88873 0
- 96 121 646842 0 0 0 0 86500 96982 0
-112 134 761083 0 0 0 0 87326 102948 0
-128 155 872466 0 0 0 0 87668 112924 0
\ No newline at end of file
+ 16 18 104453 0 0 146992 0 86361 67977 0
+ 32 36 203698 0 0 150361 0 87629 71108 0
+ 48 89 312316 0 0 155162 0 86803 76783 0
+ 64 104 428321 0 0 161366 0 89776 82720 0
+ 80 93 532930 0 0 171931 0 87039 88321 0
+ 96 113 642233 0 0 176509 0 86327 95598 0
+112 169 745426 0 0 186020 0 87538 102664 0
+128 202 860052 0 0 196480 0 88918 109959 0
diff --git a/examples/mulmat-tune/mulmat-tune.c b/examples/mulmat-tune/mulmat-tune.c
index b7f0744e35e92e..d9b31d76ca4f83 100644
--- a/examples/mulmat-tune/mulmat-tune.c
+++ b/examples/mulmat-tune/mulmat-tune.c
@@ -79,25 +79,31 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
         return rc;
     }

-    tune->items = malloc(sizeof(struct ggml_mulmat_tune_m) *
-                         (tune->n_shapes * tune->n_profiles * tune->m_num));
-    if (tune->items == NULL) {
-        fprintf(stderr, "failed to allocate memory\n");
-        return -2;
+    {
+        size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
+                           (tune->n_shapes * tune->n_profiles * tune->m_num);
+        tune->items = malloc(item_size);
+        if (tune->items == NULL) {
+            fprintf(stderr, "failed to allocate memory\n");
+            return -2;
+        }
+        memset(tune->items, 0, item_size);
     }

-    size_t sz = sizeof(struct ggml_task_profile) * tune->n_profiles;
-    tune->profiles = malloc(sz);
-    GGML_ASSERT(tune->profiles);
-    memset(tune->profiles, 0, sz);
+    {
+        size_t sz = sizeof(struct ggml_task_profile) * tune->n_profiles;
+        tune->profiles = malloc(sz);
+        GGML_ASSERT(tune->profiles);
+        memset(tune->profiles, 0, sz);
+    }

     for (int ip = 0; ip < tune->n_profiles; ip++) {
         struct ggml_task_profile *profile = &tune->profiles[ip];
         for (int j = 0; j < 3; j++) {
             struct ggml_task_stage *ts = &profile->stages[j];
             int backend, parallel, wait;
-            rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
-            if (rc <= 0) {
+            if (rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait),
+                rc <= 0) {
                 return rc;
             }
             ts->backend = backend;
@@ -107,9 +113,9 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
     }

     for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
-        rc = fscanf(fp, "%d %d", &tune->shapes[i_shape].N,
-                    &tune->shapes[i_shape].K);
-        if (rc <= 0) {
+        if (rc = fscanf(fp, "%d %d", &tune->shapes[i_shape].N,
+                        &tune->shapes[i_shape].K),
+            rc <= 0) {
             return rc;
         }

@@ -117,8 +123,7 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
         int M;
         for (int ip = 0; ip < tune->n_profiles; ip++) {
             if (ip == 0) {
-                rc = fscanf(fp, "%d", &M);
-                if (rc <= 0) {
+                if (rc = fscanf(fp, "%d", &M), rc <= 0) {
                     return rc;
                 }
             }
@@ -126,9 +131,9 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
                 ggml_mulmat_tune_get_item_index(tune, i_shape, ip, i_m);
             struct ggml_mulmat_tune_m *item = &tune->items[index];
             item->M = M;
-            rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
-                        &item->stages_time[1], &item->stages_time[2]);
-            if (rc <= 0) {
+            if (rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
+                            &item->stages_time[1], &item->stages_time[2]),
+                rc <= 0) {
                 return rc;
             }
         }
@@ -139,11 +144,12 @@
 }

 int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
-    int rc = fprintf(fp, "%d %s %d %s %d %s %d %d %d %d\n", tune->version,
+    int rc;
+    if (rc = fprintf(fp, "%d %s %d %s %d %s %d %d %d %d\n", tune->version,
                 tune->model, tune->type, tune->type_name, tune->backend,
                 tune->blas_vendor, tune->n_shapes, tune->m_step,
-                tune->m_num, tune->n_profiles);
-    if (rc <= 0) {
+                tune->m_num, tune->n_profiles),
+        rc <= 0) {
         return rc;
     }

@@ -151,28 +157,25 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
         struct ggml_task_profile *profile = &tune->profiles[i];
         for (int j = 0; j < 3; j++) {
             struct ggml_task_stage *ts = &profile->stages[j];
-            rc = fprintf(fp, "%2d %d %d", ts->backend,
-                         ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
-            if (rc <= 0) {
+            if (rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
+                             ts->wait ? 1 : 0),
+                rc <= 0) {
                 return rc;
             }

             if (j < 2) {
-                rc = fprintf(fp, " ");
-                if (rc <= 0) {
+                if (rc = fprintf(fp, " "), rc <= 0) {
                     return rc;
                 }
             }
         }
-        rc = fprintf(fp, "\n");
-        if (rc <= 0) {
+        if (rc = fprintf(fp, "\n"), rc <= 0) {
             return rc;
         }
     }

     for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
         const struct ggml_mulmat_tune_nk *shape = &tune->shapes[i_shape];
-        rc = fprintf(fp, "%d %d\n", shape->N, shape->K);
-        if (rc <= 0) {
+        if (rc = fprintf(fp, "%d %d\n", shape->N, shape->K), rc <= 0) {
             return rc;
         }

@@ -182,8 +185,7 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
                 ggml_mulmat_tune_get_item_index(tune, i_shape, ip, i_m);
             struct ggml_mulmat_tune_m *item = &tune->items[index];
             if (ip == 0) {
-                rc = fprintf(fp, "%3d", item->M);
-                if (rc <= 0) {
+                if (rc = fprintf(fp, "%3d", item->M), rc <= 0) {
                     return rc;
                 }
             }
@@ -191,20 +193,18 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
             struct ggml_task_profile *profile = &tune->profiles[ip];
             for (int k = 0; k < 3; k++) {
                 if (profile->stages[k].backend != GGML_BACKEND_UNKNOWN) {
-                    rc = fprintf(fp, "%9d", item->stages_time[k]);
-                    if (rc <= 0) {
+                    if (rc = fprintf(fp, "%9d", item->stages_time[k]),
+                        rc <= 0) {
                         return rc;
                     }
                 } else {
-                    rc = fprintf(fp, "        0");
-                    if (rc <= 0) {
+                    if (rc = fprintf(fp, "        0"), rc <= 0) {
                         return rc;
                     }
                 }
             }
         }
-        rc = fprintf(fp, "\n");
-        if (rc <= 0) {
+        if (rc = fprintf(fp, "\n"), rc <= 0) {
             return rc;
         }
     }
@@ -298,8 +298,8 @@ void ggml_mulmat_tune_estimate_time(
             if (ts->parallel) {
                 t /= nth;
             }
-            time_stats->profile_time[ip].stage_time[stage] = t;
-            time_stats->profile_time[ip].total_time += t;
+            time_stats->profile_time[ip].stage_time[stage] = (int)t;
+            time_stats->profile_time[ip].total_time += (int)t;
         }
     }
 }
@@ -313,7 +313,7 @@ static const char *ggml_backend_names[] = {

 const char *ggml_get_backend_name(enum ggml_backend backend) {
     if (backend == GGML_BACKEND_UNKNOWN) {
-        return "";
+        return "UNKNOWN";
     }
     return ggml_backend_names[backend];
 }
diff --git a/examples/mulmat-tune/mulmat-tune.h b/examples/mulmat-tune/mulmat-tune.h
index 5dee88935c1289..f98405795013b6 100644
--- a/examples/mulmat-tune/mulmat-tune.h
+++ b/examples/mulmat-tune/mulmat-tune.h
@@ -13,7 +13,7 @@ extern "C" {
 #define GGML_MULMAT_MAX_PROFILES 4

 struct ggml_task_stage {
-    int backend; // enum ggml_backend
+    /*enum ggml_backend*/ int backend;
     bool parallel;
     bool wait;
 };
@@ -99,7 +99,6 @@ void ggml_mulmat_init_task_profiles(/*enum ggml_backend*/ int backend);
 int ggml_mulmat_get_task_profiles(struct ggml_task_profile **profiles,
                                   int src0_type, int src1_type);

-// returns enum ggml_backend
 /*enum ggml_backend*/ int ggml_auto_detect_backend(void);

 const char *ggml_get_backend_name(/*enum ggml_backend*/ int backend);
diff --git a/ggml.c b/ggml.c
index a962dcf16a2b6b..90022425b51db6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9548,14 +9548,6 @@ static void ggml_compute_forward_rms_norm_back(
 // }
 // #endif

-
-
-
-
-
-// TODO: allow compile CUDA/CL along with CBLAS
-
-
 static void ggml_compute_forward_mul_mat_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10074,11 +10066,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
                         0.0f, d, ne01);
             }
         }
+        return;
     }
-    return;

     GGML_ASSERT(nth == 1);
     GGML_ASSERT(params->type == GGML_TASK_COMPUTE);
+    GGML_ASSERT(init_backend == GGML_BACKEND_UNKNOWN);
     GGML_ASSERT(compute_backend == GGML_BACKEND_CBLAS);

     float * const wdata = params->wdata;
@@ -14432,13 +14425,19 @@ void ggml_graph_compute_mul_mat_set_task_profile(struct ggml_cgraph *cgraph) {
                 e->N = N;
                 e->K = K;
                 e->profile = profile;
+
+                GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
+                    M, N, K,
+                    profile->stages[0].backend,
+                    profile->stages[1].backend,
+                    profile->stages[2].backend);
             }
         }
     }

     memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));

-    GGML_PRINT_THREAD_DEBUG("M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
+    GGML_PRINT_THREAD_DEBUG("(2) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
         M, N, K,
         profile->stages[0].backend,
         profile->stages[1].backend,
@@ -14504,14 +14503,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     // thread scheduling for the different operations
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
+        struct ggml_task_stage *stages = node->task_profile.stages;
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
                     }
@@ -14522,10 +14520,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;

                     size_t cur = 0;

@@ -14537,13 +14533,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_ACC:
                 {
-                    node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;

                     size_t cur = 0;

@@ -14568,9 +14560,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_STEP:
             case GGML_OP_RELU:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_MUL:
             case GGML_OP_GELU:
@@ -14580,28 +14570,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
                 } break;
             case GGML_OP_MUL_MAT:
                 {
                     size_t cur = 0;
+                    enum ggml_backend compute_backend = stages[GGML_TASK_COMPUTE].backend;
 #if defined(GGML_USE_CUBLAS)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                     }
                     else
 #elif defined(GGML_USE_CLBLAST)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
                     }
                     else
 #endif
                     if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         // here we need memory just for single 2D matrix from src0
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
                     } else {
@@ -14614,7 +14603,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         cur = 0;
                     } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
                     } else
 #endif
@@ -14630,19 +14619,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SCALE:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
                 } break;
             case GGML_OP_SET:
                 {
-                    node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_CONT:
             case GGML_OP_RESHAPE:
@@ -14654,42 +14637,30 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_DIAG:
             case GGML_OP_DIAG_MASK_ZERO:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_DIAG_MASK_INF:
             case GGML_OP_SOFT_MAX:
             case GGML_OP_ROPE:
             case GGML_OP_ROPE_BACK:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
                 } break;
             case GGML_OP_ALIBI:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_CLAMP:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_CONV_1D_1S:
             case GGML_OP_CONV_1D_2S:
                 {
-                    node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;

                     GGML_ASSERT(node->src0->ne[3] == 1);
                     GGML_ASSERT(node->src1->ne[2] == 1);
@@ -14718,10 +14689,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;

                     size_t cur = 0;

@@ -14741,10 +14710,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;

                     size_t cur = 0;
                     if (node->src1->type == GGML_TYPE_F32) {
@@ -14762,15 +14729,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_NONE:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_COUNT:
                 {
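
Note on the error-handling idiom introduced throughout this patch: the repeated
`if (rc = f(...), rc <= 0) { return rc; }` form uses the C comma operator so the
call and its check fit in a single `if`. A minimal standalone sketch of the
idiom follows; the `write_pair` helper is hypothetical, not part of the patch:

    #include <stdio.h>

    // Write one "N K" record and propagate fprintf failures to the caller,
    // mirroring the pattern used in ggml_mulmat_tune_write_data.
    static int write_pair(FILE *fp, int n, int k) {
        int rc;
        // Comma operator: the fprintf runs first and its value is discarded;
        // the if then tests only the right-hand expression `rc <= 0`.
        if (rc = fprintf(fp, "%d %d\n", n, k), rc <= 0) {
            return rc; // fprintf returns a negative value on error
        }
        return 0;
    }

Because the condition the `if` actually tests is `rc <= 0` rather than the
assignment itself, compilers do not emit the usual "assignment used as truth
value" warning that plain `if (rc = fprintf(...))` would trigger.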