From caff3db90a57c098f0dc7efa9f738453a4e83525 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Dec 2023 11:23:23 +0200 Subject: [PATCH 1/6] sync : llama.cpp (ggml_scale, ggml_row_size, ggml_mul_mat_set_prec) ggml-ci --- examples/dolly-v2/main.cpp | 41 ++- examples/gpt-2/main-alloc.cpp | 46 ++- examples/gpt-2/main-backend.cpp | 9 +- examples/gpt-2/main-batched.cpp | 43 +-- examples/gpt-2/main-ctx.cpp | 44 ++- examples/gpt-2/main.cpp | 17 +- examples/gpt-j/main.cpp | 37 +- examples/gpt-neox/main.cpp | 41 ++- examples/mnist/main.cpp | 8 +- examples/mpt/main.cpp | 25 +- examples/replit/main.cpp | 25 +- examples/sam/main.cpp | 136 ++++---- examples/starcoder/main.cpp | 43 ++- examples/starcoder/starcoder-mmap.cpp | 3 +- examples/whisper/whisper.cpp | 32 +- include/ggml/ggml-backend.h | 7 + include/ggml/ggml.h | 53 ++- src/ggml-alloc.c | 16 +- src/ggml-backend-impl.h | 20 +- src/ggml-backend.c | 80 ++++- src/ggml-cuda.cu | 442 +++++++++++++++--------- src/ggml-metal.h | 3 + src/ggml-metal.m | 234 ++++++++++--- src/ggml-metal.metal | 13 +- src/ggml.c | 472 ++++++++++++++++++-------- tests/test-backend-ops.cpp | 19 +- tests/test-conv1d.cpp | 4 +- tests/test-conv2d.cpp | 4 +- tests/test-grad0.cpp | 10 +- tests/test-mul-mat.cpp | 4 +- tests/test-quantize-perf.cpp | 10 +- tests/test0.c | 6 +- 32 files changed, 1225 insertions(+), 722 deletions(-) diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp index cecc5626b..9e3599608 100644 --- a/examples/dolly-v2/main.cpp +++ b/examples/dolly-v2/main.cpp @@ -192,34 +192,34 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g + //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 
n_embd)); // ln_2_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v ctx_size += (6 + 16*n_layer)*512; // object overhead @@ -580,8 +580,7 @@ bool dollyv2_eval( struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); diff --git a/examples/gpt-2/main-alloc.cpp b/examples/gpt-2/main-alloc.cpp index c4bf986e3..c0a684696 100644 --- a/examples/gpt-2/main-alloc.cpp +++ b/examples/gpt-2/main-alloc.cpp @@ -165,33 +165,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte + ctx_size += ggml_row_size(GGML_TYPE_F32 , n_ctx*n_embd); // wpe + ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b - ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v ctx_size += (6 + 12*n_layer)*512; // object overhead @@ -427,12 +427,6 @@ struct ggml_cgraph * gpt2_graph( } } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - // wte + wpe struct ggml_tensor * inpL = ggml_add(ctx0, @@ -528,7 +522,7 @@ struct ggml_cgraph * gpt2_graph( struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, - KQ_scale); + 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp index 9ef873ece..dfda108ab 100644 --- a/examples/gpt-2/main-backend.cpp +++ b/examples/gpt-2/main-backend.cpp @@ -484,13 +484,6 @@ struct ggml_cgraph * gpt2_graph( } } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - float s = 1.0f/sqrtf(float(n_embd)/n_head); - ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); - } - // wte + wpe struct ggml_tensor * inpL = ggml_add(ctx0, @@ -586,7 +579,7 @@ struct ggml_cgraph * gpt2_graph( struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, - KQ_scale); + 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp index 8c998f0ba..926c6f74f 100644 --- a/examples/gpt-2/main-batched.cpp +++ b/examples/gpt-2/main-batched.cpp @@ -237,30 +237,30 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + buffer_size += ggml_row_size(wtype, n_vocab*n_embd); // wte + buffer_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe + buffer_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head - buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 
n_embd)); // ln_1_g + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + buffer_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + buffer_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b - buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + buffer_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + buffer_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b buffer_size += (6 + 12*n_layer)*128; // alignment overhead @@ -599,13 +599,6 @@ struct ggml_cgraph * gpt2_graph( } } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - float s = 1.0f/sqrtf(float(n_embd)/n_head); - ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); - } - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); @@ -720,7 +713,7 @@ struct ggml_cgraph * gpt2_graph( struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, - KQ_scale); + 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_kv, n_tokens, 12] diff --git a/examples/gpt-2/main-ctx.cpp b/examples/gpt-2/main-ctx.cpp index 0eb71a573..2c075f38d 100644 --- a/examples/gpt-2/main-ctx.cpp +++ b/examples/gpt-2/main-ctx.cpp @@ -164,33 +164,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte + ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe + ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g 
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v ctx_size += (6 + 12*n_layer)*512; // object overhead @@ -531,11 +531,7 @@ bool gpt2_eval( // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 08a9aa3f8..a0109ffee 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -99,7 +99,6 @@ struct gpt2_model { // inputs/constants struct ggml_tensor * embd; struct ggml_tensor * position; - struct ggml_tensor * KQ_scale; }; void init_backends(gpt2_model & model, const gpt_params & params) { @@ -511,14 +510,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & { model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); - model.KQ_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // FIXME: should be in backend_kv, but also shouldn't matter ggml_set_name(model.embd, "in/embd"); ggml_set_name(model.position, "in/position"); - ggml_set_name(model.KQ_scale, "KQ_scale"); // add input tensors to cpu backend - size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position) + 
ggml_nbytes(model.KQ_scale); + size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position); // FIXME: use cpu backend after sched impl ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu; @@ -529,12 +526,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_input); ggml_allocr_alloc(alloc, model.embd); ggml_allocr_alloc(alloc, model.position); - ggml_allocr_alloc(alloc, model.KQ_scale); ggml_allocr_free(alloc); - - // initialize KQ_scale - float s = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head); - ggml_backend_tensor_set(model.KQ_scale, &s, 0, sizeof(s)); } return true; @@ -588,7 +580,7 @@ struct ggml_cgraph * gpt2_graph( } //} - struct ggml_tensor * KQ_scale = model.KQ_scale; + const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head); // wte + wpe struct ggml_tensor * inpL = @@ -697,10 +689,7 @@ struct ggml_cgraph * gpt2_graph( // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il); // KQ_masked = mask_past(KQ_scaled) diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index fbfbd4715..f29c35727 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -166,31 +166,31 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_q_proj_w + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_k_proj_w + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_v_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v ctx_size += (5 + 10*n_layer)*512; // object overhead @@ -496,8 +496,7 @@ bool gptj_eval( struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp index 5fbe5148e..37f3c61b3 100644 --- a/examples/gpt-neox/main.cpp +++ b/examples/gpt-neox/main.cpp @@ -166,34 +166,34 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ const size_t n_ctx = hparams.n_ctx; const size_t n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g + //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v ctx_size += (6 + 16*n_layer)*1024; // object overhead @@ -562,8 +562,7 @@ bool gpt_neox_eval( struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index a03b95341..358085861 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -65,11 +65,11 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { const int n_hidden = hparams.n_hidden; const int n_classes = hparams.n_classes; - ctx_size += n_input * n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 weight - ctx_size += n_hidden * ggml_type_sizef(GGML_TYPE_F32); // fc1 bias + ctx_size += n_input * n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 weight + ctx_size += n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 bias - ctx_size += n_hidden * n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 weight - ctx_size += n_classes * ggml_type_sizef(GGML_TYPE_F32); // fc2 bias + ctx_size += n_hidden * n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 weight + ctx_size += n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 bias printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp index b934e4994..a16367cc1 100644 --- a/examples/mpt/main.cpp +++ b/examples/mpt/main.cpp @@ -274,18 +274,21 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo const size_t n_layer = hparams.n_layers; const size_t n_vocab = hparams.n_vocab; - ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight - ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // norm_f_weight + ctx_size += ggml_row_size(wtype, n_embd * n_vocab); // wte_weight + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // norm_f_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight - ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight - ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight - ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight - ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight + ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight + ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); 
// attn_out_proj_weight + + ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight + + ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight + ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight + + ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k + ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v ctx_size += (1 + 6 * n_layer) * 512; // object overhead @@ -568,7 +571,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); + ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max); diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp index f851470e9..acd1cbb5e 100644 --- a/examples/replit/main.cpp +++ b/examples/replit/main.cpp @@ -256,18 +256,21 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t const int n_ctx = hparams.max_seq_len; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd * n_vocab * ggml_type_sizef(wtype); // wte_weight - ctx_size += n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_f_weight + ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte_weight + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_weight - ctx_size += n_layer * (3 * n_embd * n_embd * ggml_type_sizef(wtype)); // attn_Wqkv_weight - ctx_size += n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // attn_out_proj_weight - ctx_size += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_2_weight - ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight - ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight + ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight + ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight + + ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight + + ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight + ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight + + ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k + ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v ctx_size += (1 + 6 * n_layer) * 512; // object overhead @@ -539,7 +542,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f / sqrt(float(n_embd) / n_head))); + ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f); diff --git a/examples/sam/main.cpp b/examples/sam/main.cpp index 38d5e2734..d8dedf85f 100644 --- a/examples/sam/main.cpp +++ 
b/examples/sam/main.cpp @@ -568,56 +568,56 @@ bool sam_model_load(const sam_params & params, sam_model & model) { // image encoder { - ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_state*ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_state*n_enc_out_chans*1*1*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_size(GGML_TYPE_F16); - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); } // image encoder layers { - ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16); - ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_sizef(GGML_TYPE_F16); + ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16); - ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer*3*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer*3*n_enc_state* ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer*n_enc_state* ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_layer*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += 
n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer*4*n_enc_state* ggml_type_size(GGML_TYPE_F32); - ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_enc_layer*4*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_enc_layer*4*n_enc_state* ggml_type_size(GGML_TYPE_F32); } ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead(); // prompt encoder { - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); // 2*(n_enc_out_chans/2) + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); // 2*(n_enc_out_chans/2) - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); } ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead(); @@ -632,62 +632,62 @@ bool sam_model_load(const sam_params & params, sam_model & model) { const int n_hypernet_mpls_count = 4; // self_attn - ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*n_enc_state* ggml_type_size(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32); // all norms - ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32); + ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32); // cross_attn_token_to_img - ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32); // mlp - ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); + ctx_size += tfm_layers_count*8*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); + ctx_size += tfm_layers_count*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32); // cross_attn_img_to_token - ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += 
tfm_layers_count*n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16); + ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32); + ctx_size += tfm_layers_count*n_enc_state* ggml_type_size(GGML_TYPE_F32); // transformer_final_attn_token_to_img - ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += qkv_count*(n_enc_state/2)* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_state* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16); + ctx_size += qkv_count*(n_enc_state/2)* ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_state* ggml_type_size(GGML_TYPE_F32); // transformer_norm_final - ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += norm_count*n_enc_state*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32); + ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32); // output_upscaling - ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += 3*n_img_embd* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += (n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_size(GGML_TYPE_F16); + ctx_size += 3*n_img_embd* ggml_type_size(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_size(GGML_TYPE_F16); + ctx_size += (n_img_embd/2)* ggml_type_size(GGML_TYPE_F32); // output_hypernetworks_mlps - ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32); + ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_hypernet_mpls_count*(n_img_embd/2)* ggml_type_size(GGML_TYPE_F32); // iou_prediction_head - ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += 2*n_enc_out_chans* ggml_type_sizef(GGML_TYPE_F32); - ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F16); - ctx_size += n_pt_embd* ggml_type_sizef(GGML_TYPE_F32); + ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); + ctx_size += 2*n_enc_out_chans* ggml_type_size(GGML_TYPE_F32); + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); + ctx_size += n_pt_embd* ggml_type_size(GGML_TYPE_F32); // iou_token_w - ctx_size += n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); // mask_tokens_w - ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32); } } fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); @@ -1137,7 +1137,7 @@ struct ggml_tensor * sam_fill_dense_pe( struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, 
ggml_transpose(ctx0, enc.pe)), xy_embed_stacked); - cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI))); + cur = ggml_scale(ctx0, cur, float(2.0*M_PI)); // concat // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192 @@ -1307,8 +1307,7 @@ struct ggml_cgraph * sam_encode_image( struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(n_enc_head_dim)) - ); + 1.0f/sqrtf(n_enc_head_dim)); struct ggml_tensor * rw = ggml_get_rel_pos(ctx0, layer.rel_pos_w, W, W); struct ggml_tensor * rh = ggml_get_rel_pos(ctx0, layer.rel_pos_h, H, H); @@ -1455,7 +1454,7 @@ prompt_encoder_result sam_encode_prompt( struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp); - cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, float(2.0*M_PI))); + cur = ggml_scale(ctx0, cur, float(2.0*M_PI)); // concat // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192 @@ -1537,10 +1536,7 @@ struct ggml_tensor* sam_decode_mask_transformer_attn( // Q * K struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(Q->ne[0])))); + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(Q->ne[0]))); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp index 603473a36..b11cbb709 100644 --- a/examples/starcoder/main.cpp +++ b/examples/starcoder/main.cpp @@ -188,33 +188,33 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head const int kv_dim = kv_heads * head_dim; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g + ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte + ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe + ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: - ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(wtype, n_embd)); // c_attn_attn_w // TODO: + ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(GGML_TYPE_F32, 1)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size 
+= n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w + ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k + ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v ctx_size += (6 + 12*n_layer)*512; // object overhead @@ -569,10 +569,7 @@ bool starcoder_eval( // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] diff --git a/examples/starcoder/starcoder-mmap.cpp b/examples/starcoder/starcoder-mmap.cpp index 8d2c72ddb..b1acb575e 100644 --- a/examples/starcoder/starcoder-mmap.cpp +++ b/examples/starcoder/starcoder-mmap.cpp @@ -782,8 +782,7 @@ bool starcoder_eval( struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + 1.0f/sqrt(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index 594d6006d..d0eb59077 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -1777,7 +1777,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false); - ggml_allocr * alloc = wstate.alloc_encode.alloc; + //ggml_allocr * alloc = wstate.alloc_encode.alloc; //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state); //ggml_allocr_alloc(alloc, cur); @@ -1787,13 +1787,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder( //} struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv); - struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(alloc, KQscale); - - if (!ggml_allocr_is_measure(alloc)) { - const float val = 1.0f/sqrtf(float(n_state)/n_head); - ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float)); - } + const float KQscale = 1.0f/sqrtf(float(n_state)/n_head); // =================================================================== // NOTE: experimenting with partial evaluation of the encoder (ignore) @@ -1843,14 +1837,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder( Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b); - //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25)); // note: no bias for Key struct 
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k_w, cur); - //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25)); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v_w, @@ -2032,7 +2026,7 @@ static struct ggml_cgraph * whisper_build_graph_cross( ggml_cgraph * gf = ggml_new_graph(ctx0); - ggml_allocr * alloc = wstate.alloc_cross.alloc; + //ggml_allocr * alloc = wstate.alloc_cross.alloc; //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); //ggml_allocr_alloc(alloc, cur); @@ -2042,13 +2036,7 @@ static struct ggml_cgraph * whisper_build_graph_cross( //} struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc); - struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(alloc, Kscale); - - if (!ggml_allocr_is_measure(alloc)) { - const float val = pow(float(n_state) / n_head, -0.25); - ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float)); - } + const float Kscale = pow(float(n_state) / n_head, -0.25); for (int il = 0; il < model.hparams.n_text_layer; ++il) { auto & layer = model.layers_decoder[il]; @@ -2207,13 +2195,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder( } } - struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(alloc, KQscale); - - if (!ggml_allocr_is_measure(alloc)) { - const float val = pow(float(n_state)/n_head, -0.25); - ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float)); - } + const float KQscale = pow(float(n_state)/n_head, -0.25); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_allocr_alloc(alloc, KQ_mask); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 58d5ccae6..a9d2fddd7 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -21,6 +21,7 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); @@ -29,6 +30,8 @@ extern "C" { GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); // @@ -76,6 +79,10 @@ extern "C" { GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif + // // Backend registry // diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 1447646b1..338f355a4 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -303,7 +303,7 @@ extern "C" { #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && 
!defined(_MSC_VER) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; @@ -343,6 +343,12 @@ extern "C" { GGML_TYPE_COUNT, }; + // precision + enum ggml_prec { + GGML_PREC_DEFAULT, + GGML_PREC_F32, + }; + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, @@ -478,7 +484,8 @@ extern "C" { enum ggml_log_level { GGML_LOG_LEVEL_ERROR = 2, GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4 + GGML_LOG_LEVEL_INFO = 4, + GGML_LOG_LEVEL_DEBUG = 5 }; // ggml object @@ -502,7 +509,6 @@ extern "C" { struct ggml_backend_buffer * buffer; - int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements size_t nb[GGML_MAX_DIMS]; // stride in bytes: // nb[0] = ggml_type_size(type) @@ -534,7 +540,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[12]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -639,11 +645,14 @@ extern "C" { GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); - GGML_API int ggml_blck_size (enum ggml_type type); - GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block - GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + GGML_API int ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_DEPRECATED( + GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float + "use ggml_row_size() instead"); GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_op_name (enum ggml_op op); @@ -662,6 +671,11 @@ extern "C" { GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -722,8 +736,8 @@ extern "C" { GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); // Context tensor enumeration and lookup - GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); - GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); @@ -1050,6 +1064,12 @@ extern "C" { struct ggml_tensor * a, 
struct ggml_tensor * b); + // change the precision of a matrix multiplication + // set to GGML_PREC_F32 for higher precision (useful for phi-2) + GGML_API void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + // indirect matrix multiplication // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) GGML_API struct ggml_tensor * ggml_mul_mat_id( @@ -1075,13 +1095,13 @@ extern "C" { GGML_API struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + float s); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + float s); // b -> view(a,offset,nb1,nb2,3), return modified a GGML_API struct ggml_tensor * ggml_set( @@ -2116,10 +2136,11 @@ extern "C" { GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); - GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); - GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); - GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); - GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); // overrides existing values or adds a new one GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index d3049efb4..a97436b17 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; + // views are initialized in the alloc buffer rather than the view_src buffer + view->buffer = alloc->buffer; view->data = (char *)view->view_src->data + view->view_offs; - // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); if (!alloc->measure) { @@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { } void ggml_allocr_free(ggml_allocr_t alloc) { + if (alloc == NULL) { + return; + } + ggml_gallocr_free(alloc->galloc); ggml_tallocr_free(alloc->talloc); free(alloc); @@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (nbytes == 0) { - fprintf(stderr, "%s: no tensors to allocate\n", __func__); + // all the tensors in the context are already allocated return NULL; } @@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { ggml_backend_view_init(buffer, t); } + } else { + if (t->view_src != NULL) { + // view of a pre-allocated tensor + ggml_backend_view_init(buffer, t); + } } } diff --git a/src/ggml-backend-impl.h 
diff --git a/src/ggml-backend-impl.h b/src/ggml-backend-impl.h
index f588af602..05859935a 100644
--- a/src/ggml-backend-impl.h
+++ b/src/ggml-backend-impl.h
@@ -20,6 +20,9 @@ extern "C" {
         size_t (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool   (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool (*is_host)           (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        void   (*free_buffer)(ggml_backend_buffer_t buffer);
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
         //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base)   (ggml_backend_buffer_t buffer);
-        void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void * (*get_base)       (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)     (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
     };
 
     struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        void (*synchronize)     (ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
diff --git a/src/ggml-backend.c b/src/ggml-backend.c
index 3a22cd085..0c8c9ec43 100644
--- a/src/ggml-backend.c
+++ b/src/ggml-backend.c
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
     return buft->iface.supports_backend(buft, backend);
 }
 
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer
 
 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
     GGML_UNUSED(buffer);
 }
 
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
 
-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
 }
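
The CPU implementations above give every host buffer a trivial is_host and a memset-based clear. Client code can now query and bulk-initialize buffers through the public wrappers instead of poking tensor data directly; a small sketch (ctx and buft are assumed to come from the usual allocation path):

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    // zero the whole buffer (e.g. a freshly created KV cache) in one call
    ggml_backend_buffer_clear(buf, 0);

    if (ggml_backend_buffer_is_host(buf)) {
        // tensor->data pointers inside this buffer can be read/written directly
    }
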
 
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
 
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu
index 06da7b307..11d4da845 100644
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -31,6 +31,7 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
@@ -78,6 +85,7 @@
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
+#define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
@@ -510,6 +518,14 @@
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; +[[noreturn]] +static __device__ void bad_arch() { + printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); + __trap(); + + (void) bad_arch; // suppress unused function warning +} + static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -1970,8 +1986,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2008,8 +2023,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2044,8 +2058,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2090,8 +2103,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2112,8 +2124,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp return d8_0*d8_1 * sumi; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2143,8 +2154,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2179,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( return dm2f.x*sumf_d - dm2f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2217,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2258,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( return d3 * sumf; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2284,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( return d3*d8 * sumi; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2318,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2352,8 
+2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2393,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( return dm5f.x*sumf_d - dm5f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2427,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2458,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( return d*sumf; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2490,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -3357,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( return dall * sumf_d - dmin * sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -3541,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return d * sumf_d; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -3952,7 +3950,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4021,7 +4019,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_1_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4088,7 +4086,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4155,7 +4153,7 @@ mul_mat_q5_1( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_1_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4222,7 +4220,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q8_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4289,7 +4287,7 @@ mul_mat_q2_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q2_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4358,7 +4356,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q3_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4427,7 +4425,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4494,7 +4492,7 @@ mul_mat_q5_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, 
nrows_dst); #else (void) vec_dot_q5_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4563,7 +4561,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q6_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4998,7 +4996,16 @@ static __global__ void rope_neox( const int ib = col / n_dims; const int ic = col % n_dims; - const int i = row*ncols + ib*n_dims + ic/2; + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; const int i2 = row/p_delta_rows; float cur_rot = inv_ndims * ic - ib; @@ -6814,6 +6821,7 @@ static void ggml_cuda_op_get_rows( break; default: // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_ASSERT(false); break; } @@ -7378,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_compute_capabilities[id]; - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 half * src0_as_f16 = nullptr; size_t src0_as = 0; @@ -7692,17 +7700,9 @@ inline void ggml_cuda_op_scale( const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - float scale; - // HACK: support for ggml backend interface - if (src1->backend == GGML_BACKEND_CPU) { - scale = ((float *) src1->data)[0]; - } else { - // TODO: pass pointer to kernel instead of copying to host - CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); - } + const float scale = ((float *) dst->op_params)[0]; scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -7749,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; - const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; @@ -7771,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); } - if (use_src1 && !src1_stays_on_host) { + if (use_src1) { if (src1_on_device) { src1_ddf = (float *) src1_extra->data_device[g_main_device]; } else { @@ -7819,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { } #ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); @@ -7870,8 +7873,6 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - ggml_cuda_set_peer_access(ne11); - GGML_ASSERT(dst->backend != 
GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); @@ -8302,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor } static __global__ void k_compute_batched_ptrs( - const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, + const half * src0_as_f16, const half * src1_as_f16, char * dst, const void ** ptrs_src, void ** ptrs_dst, - int ne12, int ne13, - int ne23, - int nb02, int nb03, - int nb12, int nb13, - int nb2, int nb3, - int r2, int r3) { - int i13 = blockIdx.x * blockDim.x + threadIdx.x; - int i12 = blockIdx.y * blockDim.y + threadIdx.y; + int64_t ne12, int64_t ne13, + int64_t ne23, + size_t nb02, size_t nb03, + size_t nb12, size_t nb13, + size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3) { + int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int i03 = i13 / r3; - int i02 = i12 / r2; + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -8378,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + half * dst_f16 = nullptr; + char * dst_t = nullptr; + + cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; + cudaDataType_t cu_data_type = CUDA_R_16F; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + dst_t = (char *) dst_f16; + + nbd2 /= sizeof(float) / sizeof(half); + nbd3 /= sizeof(float) / sizeof(half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = CUDA_R_32F; + + alpha = &alpha_f32; + beta = &beta_f32; + } GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -8387,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - #if 0 // use cublasGemmEx { @@ -8399,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, - CUBLAS_COMPUTE_16F, + alpha, (const char *) src0_as_f16 + 
i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } @@ -8416,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA - (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB - &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC ne12*ne13, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx @@ -8437,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_as_f16, src1_as_f16, dst_f16, + src0_as_f16, src1_as_f16, dst_t, ptrs_src, ptrs_dst, ne12, ne13, ne23, nb02, nb03, nb12, nb13, - dst->nb[2], dst->nb[3], + nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01, ne23, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (ptrs_src_s != 0) { @@ -8466,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } #endif - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(dst_f16, dst_as); + } ggml_cuda_pool_free(src1_as_f16, src1_as); - ggml_cuda_pool_free(dst_f16, dst_as); } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -8734,7 +8769,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s // TODO: mmq/mmv support #endif - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; const struct ggml_tensor * ids = src0; const int32_t id = ((int32_t *) dst->op_params)[0]; @@ -8742,10 +8778,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s std::vector ids_host(ggml_nbytes(ids)); + const cudaStream_t stream = g_cudaStreams[g_main_device][0]; + if (ids->backend == GGML_BACKEND_GPU) { const char * ids_dev = (const char 
*)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } else { memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); } @@ -8759,37 +8797,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.ne[1] = 1; - dst_row.ne[1] = 1; - - src1_row.nb[2] = src1_row.nb[1]; - dst_row.nb[2] = dst_row.nb[1]; - - src1_row.nb[3] = src1_row.nb[1]; - dst_row.nb[3] = dst_row.nb[1]; + src1_row.backend = GGML_BACKEND_GPU; + dst_row.backend = GGML_BACKEND_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; + char * src1_original = src1->backend == GGML_BACKEND_CPU ? + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; + char * dst_original = dst->backend == GGML_BACKEND_CPU ? + (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + + if (src1->ne[1] == 1) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + size_t as_src1, as_dst; + char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1); + char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst); + + src1_row_extra.data_device[g_main_device] = src1_contiguous; + dst_row_extra.data_device[g_main_device] = dst_contiguous; + + const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? + cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? 
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - GGML_ASSERT(row_id >= 0 && row_id < n_as); + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11, + nb11, src1_kind, stream)); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + GGML_ASSERT(row_id >= 0 && row_id < n_as); - src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; + CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1, + nb1, dst_kind, stream)); + num_src1_rows++; + } + } - dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; + ggml_cuda_pool_free(src1_contiguous, as_src1); + ggml_cuda_pool_free(dst_contiguous, as_dst); + } - ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaStreamSynchronize(stream)); } } @@ -8900,6 +9011,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg (void) dst; } +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const int64_t nrows = ggml_nrows(tensor); @@ -8949,8 +9066,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } char * buf; @@ -8977,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { } void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { return; } @@ -9100,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - const bool 
inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW; + const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -9184,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; } if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); #endif return false; } @@ -9320,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + if (params->ith != 0) { return true; } @@ -9393,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { - assert(tensor->view_src->buffer->buft == buffer->buft); // TODO + assert(tensor->view_src->buffer->buft == buffer->buft); tensor->backend = tensor->view_src->backend; tensor->extra = tensor->view_src->extra; return; @@ -9424,8 +9543,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -9437,8 +9554,6 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda 
*)buffer->context; @@ -9449,6 +9564,15 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); +} + static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, @@ -9457,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_buffer_clear, }; // cuda buffer type @@ -9493,8 +9618,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } } @@ -9509,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t UNUSED(buft); } -static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = { +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ nullptr, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES]; - static bool ggml_backend_buffer_type_cuda_initialized = false; - if (!ggml_backend_buffer_type_cuda_initialized) { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { - ggml_backend_buffer_type_cuda[i] = { - /* .iface = */ cuda_backend_buffer_type_interface, + ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, }; } - ggml_backend_buffer_type_cuda_initialized = true; + ggml_backend_cuda_buffer_type_initialized = true; } - return &ggml_backend_buffer_type_cuda[device]; + return &ggml_backend_cuda_buffer_types[device]; } // host buffer type static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - CUDA_CHECK(cudaFreeHost(ctx->dev_ptr)); - delete ctx; + CUDA_CHECK(cudaFreeHost(buffer->context)); } static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -9550,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm 
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; - - UNUSED(buft); } -struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = { - /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, -}; - ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = { - /* .iface = */ cuda_backend_host_buffer_type_interface, + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, /* .context = */ nullptr, }; - return &ggml_backend_buffer_type_cuda_host; + return &ggml_backend_cuda_buffer_type_host; } // backend @@ -9599,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); @@ -9610,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); diff --git a/src/ggml-metal.h b/src/ggml-metal.h index bf52d9cd3..b5e02b668 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); +GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); + GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); + GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 465679a6b..51a72ae33 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -180,7 +180,15 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -ggml_log_callback ggml_metal_log_callback = NULL; + +static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * 
msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@@ -607,12 +615,24 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
-    void * data;
+
+struct ggml_backend_metal_buffer {
+    void   * data;
+    size_t   size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
 
-        *offs = (size_t) ioffs;
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
-        return buf_ctx->metal;
+        return nil;
     }
 
     // find the view that contains the tensor fully
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
             {
                 GGML_ASSERT(ggml_is_contiguous(src0));
 
-                const float scale = *(const float *) src1->data;
+                const float scale = *(const float *) dst->op_params;
 
                 int64_t n = ggml_nelements(dst);
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
                     [encoder setComputePipelineState:ctx->pipeline_scale];
                 }
 
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
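
As with the CUDA path earlier in this patch, the Metal scale kernel now reads its constant from dst->op_params instead of from a second source tensor, so the scalar travels inside the op node and no device-side copy is needed. Roughly, the flow is (a sketch of the internal pattern, not verbatim from this patch):

    // graph build time: ggml_scale() stores the float in the op node,
    // via the internal ggml_set_op_params() helper
    float s = 0.125f;
    memcpy(result->op_params, &s, sizeof(s));

    // execute time: each backend fetches it straight from the node
    const float scale = *(const float *) dst->op_params;
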
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->data;
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-    UNUSED(buffer);
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -2441,8 +2477,11 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
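
The allocation paths that follow round every buffer up to a whole number of pages, because newBufferWithBytesNoCopy only accepts page-aligned memory, and ggml_backend_metal_buffer_from_ptr further splits regions larger than device.maxBufferLength into overlapping views. A sketch of why the overlap guarantees that any tensor of at most max_size bytes fits entirely inside one view (size_page, size_view and max_size are as in the code below; size_view equals maxBufferLength):

    // consecutive views start size_step bytes apart but are size_view bytes
    // long, so neighbouring views share size_ovlp >= max_size bytes
    const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
    const size_t size_step = size_view - size_ovlp;

    // a tensor at offset o satisfies o + max_size <= (o/size_step)*size_step + size_view,
    // i.e. it lies fully inside the view that starts at (o/size_step)*size_step
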
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->data  = ggml_metal_host_malloc(size);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+        /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
 };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size +
size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + GGML_METAL_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); +} + +// backend + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static void ggml_backend_metal_synchronize(ggml_backend_t backend) { - UNUSED(backend); -} - static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); @@ -2529,25 +2685,15 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .get_tensor_async = */ NULL, /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, - /* .synchronize = */ ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, }; -// TODO: make a common log callback for all backends in ggml-backend -static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); -} - ggml_backend_t ggml_backend_metal_init(void) { - ggml_metal_log_set_callback(ggml_backend_log_callback, NULL); - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) { diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal index fe0ada445..d5b54e112 100644 --- a/src/ggml-metal.metal +++ b/src/ggml-metal.metal @@ -1702,8 +1702,9 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 2*tiitg; ic < 
n_dims; ic += 2*tptg.x) { + for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) { + if (ic < n_dims) { + const int64_t ib = 0; // simplified from `(ib * n_dims + ic) * inv_ndims` const float cur_rot = inv_ndims*ic - ib; @@ -1722,6 +1723,14 @@ kernel void kernel_rope( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } diff --git a/src/ggml.c b/src/ggml.c index 29e18a24c..f27920a2d 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); -} - int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } @@ -2011,8 +2005,13 @@ size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -float ggml_type_sizef(enum ggml_type type) { - return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; +size_t ggml_row_size(enum ggml_type type, int64_t ne) { + assert(ne % ggml_blck_size(type) == 0); + return ggml_type_size(type)*ne/ggml_blck_size(type); +} + +double ggml_type_sizef(enum ggml_type type) { + return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } const char * ggml_type_name(enum ggml_type type) { @@ -2049,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } -static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { +bool ggml_is_scalar(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { +bool ggml_is_vector(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { +bool ggml_is_matrix(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[2] == 1 && tensor->ne[3] == 1; } +bool ggml_is_3d(const struct ggml_tensor * tensor) { + return tensor->ne[3] == 1; +} + +int ggml_n_dims(const struct ggml_tensor * tensor) { + for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) { + if (tensor->ne[i] > 1) { + return i + 1; + } + } + return 1; +} + static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2371,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) { size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; - struct ggml_object * obj = ctx->objects_begin; - - while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { - struct ggml_tensor 
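/*
 * (worked example for the ggml_row_size() helper introduced below; the Q4_0
 * block constants are assumed from ggml's quantization scheme)
 * ggml_row_size(type, ne) returns the exact byte size of ne elements and
 * asserts that ne is a multiple of the block size. For Q4_0, which packs 32
 * elements into an 18-byte block:
 *
 *     ggml_row_size(GGML_TYPE_Q4_0, 4096) == (4096/32)*18 == 2304 bytes
 *
 * Unlike the old pattern ne*ggml_type_sizef(type), this stays in integer
 * arithmetic, so there is no float rounding on large sizes; the remaining
 * ggml_type_sizef() is widened to double.
 */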
* tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - - const size_t size = ggml_nbytes(tensor); - - if (max_size < size) { - max_size = size; - } - } - - obj = obj->next; + for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { + max_size = MAX(max_size, ggml_nbytes(tensor)); } return max_size; @@ -2473,7 +2473,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( view_src = view_src->view_src; } - size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + size_t data_size = ggml_row_size(type, ne[0]); for (int i = 1; i < n_dims; i++) { data_size *= ne[i]; } @@ -2516,7 +2516,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, /*.buffer =*/ NULL, - /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, @@ -2623,7 +2622,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); + return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { @@ -3072,7 +3071,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0); ggml_format_name(result, "%s (view)", src->name); for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3082,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor( return result; } -struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { +struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; char * const mem_buffer = ctx->mem_buffer; @@ -3098,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { return NULL; } -struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { +struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) { struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); obj = obj->next; @@ -3230,10 +3229,10 @@ static struct ggml_tensor * ggml_add_cast_impl( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL; result->src[0] = a; result->src[1] = b; @@ -3602,12 +3601,12 @@ struct ggml_tensor * ggml_sum_rows( is_node = true; } - int64_t ne[4] = {1,1,1,1}; - for (int i=1; i<a->n_dims; ++i) { + int64_t ne[GGML_MAX_DIMS] = { 1 }; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { ne[i] = a->ne[i]; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); result->op = GGML_OP_SUM_ROWS; result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; @@ -3628,8 +3627,8 @@ struct ggml_tensor * ggml_mean( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3651,8 +3650,7 @@ struct ggml_tensor * ggml_argmax( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3675,7 +3673,7 @@ struct ggml_tensor * ggml_repeat( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3702,7 +3700,7 @@ struct ggml_tensor * ggml_repeat_back( return a; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4078,7 +4076,7 @@ struct ggml_tensor * ggml_mul_mat( } const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4088,6 +4086,14 @@ struct ggml_tensor * ggml_mul_mat( return result; } +void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec) { + const int32_t prec_i32 = (int32_t) prec; + + ggml_set_op_params_i32(a, 0, prec_i32); +} + // ggml_mul_mat_id struct ggml_tensor * ggml_mul_mat_id( @@ -4112,7 +4118,7 @@ struct ggml_tensor * ggml_mul_mat_id( } const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); ggml_set_op_params_i32(result, 0, id); ggml_set_op_params_i32(result, 1, n_as); @@ -4150,7 +4156,7 @@ struct ggml_tensor * ggml_out_prod( // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4165,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod( static struct ggml_tensor * ggml_scale_impl( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, + float s, bool inplace) { - GGML_ASSERT(ggml_is_scalar(b)); GGML_ASSERT(ggml_is_padded_1d(a)); bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { is_node = true; } struct ggml_tensor * result = inplace ? 
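/*
 * (caller-side sketch of the ggml_scale() API change; a generic call site,
 * not a line from this patch) The scale factor is now a plain float stored
 * in the op's params instead of a 1-element F32 tensor, so call sites shrink
 * from
 *
 *     cur = ggml_scale(ctx, cur, ggml_new_f32(ctx, s));   // old
 *
 * to
 *
 *     cur = ggml_scale(ctx, cur, s);                      // new
 *
 * which also removes one tensor allocation per scale from every graph.
 */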
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &s, sizeof(s)); + result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -4189,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, false); + float s) { + return ggml_scale_impl(ctx, a, s, false); } struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, true); + float s) { + return ggml_scale_impl(ctx, a, s, true); } // ggml_set @@ -4435,7 +4441,7 @@ struct ggml_tensor * ggml_reshape( //GGML_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -4813,7 +4819,7 @@ struct ggml_tensor * ggml_diag( } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne); result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5460,7 +5466,7 @@ struct ggml_tensor * ggml_pool_1d( is_node = true; } - const int64_t ne[3] = { + const int64_t ne[2] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; @@ -5579,7 +5585,7 @@ struct ggml_tensor * ggml_argsort( enum ggml_sort_order order) { bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); @@ -5626,7 +5632,7 @@ struct ggml_tensor * ggml_flash_attn( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne); int32_t t = masked ? 1 : 0; ggml_set_op_params(result, &t, sizeof(t)); @@ -5659,7 +5665,7 @@ struct ggml_tensor * ggml_flash_ff( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_FLASH_FF; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -5775,7 +5781,6 @@ struct ggml_tensor * ggml_win_part( const int np = npx*npy; const int64_t ne[4] = { a->ne[0], w, w, np, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; @@ -7759,10 +7764,10 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -// TODO: OpenCL kernel support broadcast #ifdef GGML_USE_CLBLAST if (src1->backend == GGML_BACKEND_GPU) { - GGML_ASSERT(ggml_are_same_shape(src0, src1)); + // TODO: OpenCL kernel support full broadcast + GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { ggml_cl_mul(src0, src1, dst); } @@ -9159,6 +9164,8 @@ static void ggml_compute_forward_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -9228,6 +9235,8 @@ static void ggml_compute_forward_rms_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -9571,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -// off1 = offset in i11 and i1 -// cne1 = ne11 and ne1 -// in a normal matrix multiplication, off1 = 0 and cne1 = ne1 -// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1 static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - int64_t off1, int64_t cne1) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9648,9 +9652,9 @@ static void ggml_compute_forward_mul_mat( const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3); + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { float * const wdata = params->wdata; @@ -9667,7 +9671,7 @@ static void ggml_compute_forward_mul_mat( } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - cne1, ne01, ne10, + ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); @@ -9683,7 +9687,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(params->wsize >= ne11*ne12*ne13*row_size); assert(src1->type == GGML_TYPE_F32); @@ -9706,10 +9710,10 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? 
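/*
 * (context note for the wdata selection below) During GGML_TASK_INIT, src1
 * is converted row-by-row into the quantization's preferred vec_dot_type
 * inside params->wdata, at ggml_row_size(vec_dot_type, ne10) bytes per
 * (i11, i12, i13) row. When src1 already has vec_dot_type, the conversion
 * pass is skipped and src1->data is used directly, which is what the ternary
 * selects.
 */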
src1->data : params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1*ne12*ne13; // src1 rows + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne1*ne12*ne13; // src1 rows //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); @@ -9751,9 +9755,9 @@ static void ggml_compute_forward_mul_mat( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*cne1)); - const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; - const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; + const int64_t i13 = (ir1/(ne12*ne1)); + const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; + const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); // broadcast src0 into src1 const int64_t i03 = i13/r3; @@ -9793,28 +9797,191 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, + const struct ggml_tensor * ids, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type - ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]); - return; - } + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - const struct ggml_tensor * ids = src0; + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // row groups const int id = ggml_get_op_params_i32(dst, 0); const int n_as = ggml_get_op_params_i32(dst, 1); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + char * wdata_src1_end = (src1->type == vec_dot_type) ? 
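/*
 * (layout diagram for the MUL_MAT_ID work buffer set up below)
 *
 *     params->wdata:
 *       [ src1 rows converted to vec_dot_type ]  (absent if already that type)
 *       [ matrix_row_counts : n_as x int64_t  ]  rows routed to each expert
 *       [ matrix_rows : n_as x ne11 x int64_t ]  the row indices themselves
 *
 * INIT buckets every src1 row by the expert id read from `ids`; the compute
 * phase then runs one ordinary matrix multiplication per expert over just
 * its rows, instead of dispatching a separate mul_mat per row as before.
 */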
+ (char *) params->wdata : + (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); + + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] + + #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + if (params->type == GGML_TASK_INIT) { + char * wdata = params->wdata; + if (src1->type != vec_dot_type) { + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + // initialize matrix_row_counts + GGML_ASSERT(wdata == wdata_src1_end); + memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); + + // group rows by src0 matrix + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < n_as); + GGML_ASSERT(row_id >= 0 && row_id < n_as); + MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); + return; } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
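/*
 * (worked example of the thread split, with assumed numbers) The nth threads
 * parallelize whichever side is larger. With nth = 4, nr0 = 4096 src0 rows
 * and nr1 = 8 routed src1 rows, this gives nth0 = 4 and nth1 = 1: each
 * thread owns a dr0 = (4096 + 3)/4 = 1024-row slice of src0 and the full
 * src1 range. Threads whose slice ends up empty just sched_yield().
 */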
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + continue; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix + const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; + const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); + const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } + } + + #undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -10158,19 +10325,17 @@ static void ggml_compute_forward_out_prod( static void ggml_compute_forward_scale_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } // scale factor - const float v = *(float *) src1->data; + const float v = *(float *) dst->op_params; const int ith = params->ith; const int nth = params->nth; @@ -10201,12 +10366,11 @@ static void ggml_compute_forward_scale_f32( static void ggml_compute_forward_scale( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, src1, dst); + ggml_compute_forward_scale_f32(params, src0, dst); } break; default: { @@ -11395,10 +11559,13 @@ static void ggml_compute_forward_rope_f32( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11421,6 +11588,14 @@ static void ggml_compute_forward_rope_f32( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } @@ -11548,10 +11723,13 @@ static void ggml_compute_forward_rope_f16( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < 
n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11574,6 +11752,14 @@ static void ggml_compute_forward_rope_f16( dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const int64_t i0 = ic; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } @@ -14182,7 +14368,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT_ID: { @@ -14194,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor); } break; case GGML_OP_SET: { @@ -14558,7 +14744,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return replacements->vals[i]; } - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne); // insert clone into replacements GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite @@ -14650,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) { if (ggml_hash_contains(zero_table, a)) { - struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0)); + struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); @@ -14786,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad, ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), - ggml_new_f32(ctx, 2.0f)), + 2.0f), zero_table); } } break; @@ -14800,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_div(ctx, tensor->grad, tensor), - ggml_new_f32(ctx, 0.5f)), + 0.5f), zero_table); } } break; @@ -14966,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + const float s = ((float *) tensor->op_params)[0]; + src0->grad = ggml_add_or_set(ctx, src0->grad, - ggml_scale_impl(ctx, tensor->grad, src1, false), - zero_table); - } - if (src1->grad) { - src1->grad = - ggml_add_or_set(ctx, - src1->grad, - ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + ggml_scale_impl(ctx, tensor->grad, s, false), zero_table); } } break; @@ -15982,7 +16163,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { - // FIXME: blas n_tasks = n_threads; } break; case GGML_OP_OUT_PROD: @@ 
-16311,25 +16491,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } else #endif if (node->src[1]->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } } break; case GGML_OP_MUL_MAT_ID: { - const struct ggml_tensor * a = node->src[2]; - const struct ggml_tensor * b = node->src[1]; - const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) { - if (a->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]); - } - } else -#endif - if (b->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type); + const struct ggml_tensor * src0 = node->src[2]; + const struct ggml_tensor * src1 = node->src[1]; + const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); } + const int n_as = ggml_get_op_params_i32(node, 1); + cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { @@ -16559,7 +16735,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16574,7 +16750,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16664,11 +16840,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16698,11 +16872,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16874,12 +17046,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_leafs; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; @@ -16895,7 +17065,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * nb[j] = nb_cur; } - struct ggml_tensor * tensor = 
ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = (enum ggml_op) op; @@ -16912,7 +17082,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * ptr += ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } @@ -16922,12 +17092,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_nodes; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); enum ggml_op eop = (enum ggml_op) op; @@ -16998,7 +17166,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } break; default: { - tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = eop; } break; @@ -17017,7 +17185,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * result->nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } } @@ -17155,7 +17323,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (node->n_dims == 2) { + if (ggml_is_matrix(node)) { fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); } else { fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); @@ -17422,7 +17590,7 @@ static enum ggml_opt_result ggml_opt_adam( int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; + const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? 
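/*
 * (note on the ggml_n_dims() substitution here) With the n_dims field gone
 * from ggml_tensor, dimensionality is derived from the shape: the index of
 * the last extent greater than 1, plus one, never less than 1:
 *
 *     ne = {4096, 32, 1, 1}  ->  ggml_n_dims() == 2
 *     ne = {7, 1, 1, 1}      ->  ggml_n_dims() == 1
 *
 * The optimizer behavior is unchanged: weight decay still applies only to
 * parameters with at least decay_min_ndim dimensions, i.e. typically to
 * weight matrices but not to bias vectors.
 */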
decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g_ = g[i]*gnorm; @@ -18696,7 +18864,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + const size_t size_cur = ggml_row_size(info->type, ne); ctx->size += GGML_PAD(size_cur, ctx->alignment); } @@ -19025,6 +19193,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { return ctx->infos[i].name.data; } +enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) { + return ctx->infos[i].type; +} + // returns the index static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int idx = gguf_find_key(ctx, key); @@ -19200,8 +19372,8 @@ void gguf_add_tensor( ctx->infos[idx].ne[i] = 1; } - ctx->infos[idx].n_dims = tensor->n_dims; - for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].n_dims = ggml_n_dims(tensor); + for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) { ctx->infos[idx].ne[i] = tensor->ne[i]; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index afca85143..f3df8a8c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -54,7 +54,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type)); + std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size)); int64_t hist[16]; ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); @@ -72,6 +72,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); size_t bs = ggml_blck_size(t->type); + std::vector<float> vq(ggml_blck_size(t->type)); + bool quantized = ggml_is_quantized(t->type); // access elements by index to avoid gaps in views for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { @@ -85,9 +87,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { tv.push_back(*(float *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { tv.push_back((float)*(int32_t *) &buf[i]); - } else if (ggml_is_quantized(t->type)) { - std::vector<float> vq(ggml_blck_size(t->type)); - tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); + } else if (quantized) { + tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false); @@ -765,18 +766,19 @@ struct test_bin_bcast : public test_case { struct test_scale : public test_case { const ggml_type type; const std::array<int64_t, 4> ne; + float scale; std::string vars() override { - return VARS_TO_STR2(type, ne); + return VARS_TO_STR3(type, ne, scale); } test_scale(ggml_type type = GGML_TYPE_F32, - std::array<int64_t, 4> ne = {10, 10, 10, 10}) - : type(type), ne(ne) {} + std::array<int64_t, 4> ne = {10, 10, 10, 10}, + float scale = 2.0f) + : type(type), ne(ne), scale(scale) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); - ggml_tensor * scale = ggml_new_tensor_1d(ctx, type, 1); ggml_tensor * out = ggml_scale(ctx, a, scale); return out; } @@ -1554,6 +1556,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
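/*
 * (hypothetical usage, not part of the patch) With the new float member,
 * additional factors could be exercised by registering cases explicitly:
 *
 *     test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 0.5f));
 *     test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, -1.0f));
 *
 * The default instance uses scale = 2.0f.
 */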
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2) } test_cases.emplace_back(new test_alibi()); diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp index 37522fc0b..79edddb0e 100644 --- a/tests/test-conv1d.cpp +++ b/tests/test-conv1d.cpp @@ -59,8 +59,8 @@ void load_model(test_model & model, bool use_gpu = false) { size_t buffer_size = 0; { - buffer_size += K * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a - buffer_size += IL * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += K * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a + buffer_size += IL * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b buffer_size += 1024; // overhead } diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index be318c43c..00287faa2 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -59,8 +59,8 @@ void load_model(test_model & model, bool use_gpu = false) { size_t buffer_size = 0; { - buffer_size += KW * KH * IC * OC * ggml_type_sizef(GGML_TYPE_F16); // tensor a - buffer_size += IW * IH * IC * N * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a + buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b buffer_size += 1024; // overhead } diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 81c20a89c..14914def5 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -881,19 +881,19 @@ int main(int argc, const char ** argv) { // scale { srand(seed); - const int nargs = 2; + const int nargs = 1; int64_t ne2[4]; ne2[0] = 1; for (int ndims = 1; ndims <= 2; ++ndims) { - x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + const float s = -1.0f + 2.0f*frand(); + ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1])); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s)); check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); } @@ -1395,7 +1395,7 @@ int main(int argc, const char ** argv) { ggml_add1(ctx0, ggml_scale(ctx0, ggml_soft_max(ctx0, x[0]), - ggml_new_f32(ctx0, 1.0f - eps)), + 1.0f - eps), ggml_new_f32(ctx0, eps)))); check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp index 2bee73393..99359e2f4 100644 --- a/tests/test-mul-mat.cpp +++ b/tests/test-mul-mat.cpp @@ -39,8 +39,8 @@ struct test_model { void load_model(test_model & model, float* a, float* b, int M, int N, int K, bool use_gpu = false) { size_t buffer_size = 0; { - buffer_size += (M * N) * ggml_type_sizef(GGML_TYPE_F32); // tensor a - buffer_size += (N * K) * ggml_type_sizef(GGML_TYPE_F32); // tensor b + buffer_size += (M * N) * ggml_type_size(GGML_TYPE_F32); // tensor a + buffer_size += (N * K) * ggml_type_size(GGML_TYPE_F32); // tensor b buffer_size += 1024; // overhead } diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 62d0190f9..09d410b7f 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -286,7 +286,7 @@ int main(int argc, char * argv[]) 
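/*
 * (why nargs drops to 1 in the scale gradient test below) The scale factor
 * is no longer a ggml tensor, so it cannot carry a gradient and is no longer
 * a differentiable argument; only x[0] remains a parameter and the constant
 * is drawn once per trial:
 *
 *     const float s = -1.0f + 2.0f*frand();   // uniform in [-1, 1)
 *     struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
 *
 * Correspondingly, the SCALE backward pass in ggml.c now reads s back out of
 * op_params instead of differentiating with respect to a src1 tensor.
 */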
{ qfns.from_float_reference(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -300,7 +300,7 @@ int main(int argc, char * argv[]) { qfns.from_float(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -315,7 +315,7 @@ int main(int argc, char * argv[]) { qfns.to_float(test_q1, test_out, size); return test_out[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -330,7 +330,7 @@ int main(int argc, char * argv[]) { vdot.from_float(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -347,7 +347,7 @@ int main(int argc, char * argv[]) { qfns.vec_dot(size, &result, test_q1, test_q2); return result; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); diff --git a/tests/test0.c b/tests/test0.c index 7fba63e77..2d2fa85bb 100644 --- a/tests/test0.c +++ b/tests/test0.c @@ -16,17 +16,17 @@ int main(int argc, const char ** argv) { struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_I16, 10, 20); struct ggml_tensor * t3 = ggml_new_tensor_3d(ctx0, GGML_TYPE_I32, 10, 20, 30); - GGML_ASSERT(t1->n_dims == 1); + GGML_ASSERT(ggml_n_dims(t1) == 1); GGML_ASSERT(t1->ne[0] == 10); GGML_ASSERT(t1->nb[1] == 10*sizeof(float)); - GGML_ASSERT(t2->n_dims == 2); + GGML_ASSERT(ggml_n_dims(t2) == 2); GGML_ASSERT(t2->ne[0] == 10); GGML_ASSERT(t2->ne[1] == 20); GGML_ASSERT(t2->nb[1] == 10*sizeof(int16_t)); GGML_ASSERT(t2->nb[2] == 10*20*sizeof(int16_t)); - GGML_ASSERT(t3->n_dims == 3); + GGML_ASSERT(ggml_n_dims(t3) == 3); GGML_ASSERT(t3->ne[0] == 10); GGML_ASSERT(t3->ne[1] == 20); GGML_ASSERT(t3->ne[2] == 30); From 8624146bcf7b8da3ace96be13db1085d79cd63f6 Mon Sep 17 00:00:00 2001 From: Herman Semenov Date: Fri, 22 Dec 2023 09:26:49 +0000 Subject: [PATCH 2/6] ggml : add comment about backward GGML_OP_DIAG_MASK_INF (#4203) --- src/ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ggml.c b/src/ggml.c index f27920a2d..15e1984d1 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -15335,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_or_set(ctx, src0->grad, + /* ggml_diag_mask_inf_impl() shouldn't be here */ + /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), zero_table); } From 1a58054d568959b10839a747f01b3f890160c820 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 22 Dec 2023 12:12:53 +0100 Subject: [PATCH 3/6] llama : fix platforms without mmap (#4578) * llama : fix platforms without mmap * win32 : limit prefetch size to the file size * 
fix win32 error clobber, unnecessary std::string in std::runtime_error --- src/ggml-cuda.cu | 3 ++- src/ggml.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 11d4da845..a56a8fda3 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7702,7 +7702,8 @@ inline void ggml_cuda_op_scale( GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float scale = ((float *) dst->op_params)[0]; + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); diff --git a/src/ggml.c b/src/ggml.c index 15e1984d1..3656422d7 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -10335,7 +10335,8 @@ static void ggml_compute_forward_scale_f32( } // scale factor - const float v = *(float *) dst->op_params; + float v; + memcpy(&v, dst->op_params, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -15152,7 +15153,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - const float s = ((float *) tensor->op_params)[0]; + float s; + memcpy(&s, tensor->op_params, sizeof(float)); src0->grad = ggml_add_or_set(ctx, From b9e5491e1b7462e4b66ed10c7538181942cdb09b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Dec 2023 13:18:15 +0200 Subject: [PATCH 4/6] ggml-alloc : fix ggml_tallocr_is_own --- src/ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index a97436b17..a27dd54b0 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t // check if a tensor is allocated by this buffer static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer; + return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); } static bool ggml_is_view(struct ggml_tensor * t) { From fec35d6b27e17629d0d3da9724683257f093a9ca Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Dec 2023 13:19:03 +0200 Subject: [PATCH 5/6] whisper : minor --- examples/whisper/whisper.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index d0eb59077..8cb342ba9 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -487,8 +487,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) { // measure the memory usage of a graph and prepare the allocr's internal data buffer static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) { - auto & alloc = allocr.alloc; - auto & meta = allocr.meta; + auto & alloc = allocr.alloc; + auto & meta = allocr.meta; alloc = ggml_allocr_new_measure_from_backend(backend); @@ -6110,7 +6110,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { // multi-thread - for (uint32_t k = 1; k <= n_threads; k++) { + for (int32_t k = 1; k <= n_threads; k++) { char * src = (char *) malloc(size); char * dst = (char *) malloc(size); @@ -6134,13 +6134,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { const int64_t t0 = ggml_time_us(); std::vector<std::thread> threads(k - 1); - for (uint32_t th = 0; th < k - 1; ++th) { + for (int32_t th = 0; th < k - 1;
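/*
 * (rationale for the uint32_t -> int32_t loop counters here) n_threads is a
 * signed int, so `k <= n_threads` compared signed against unsigned operands
 * and could trigger -Wsign-compare on stricter toolchains; switching the
 * counters to int32_t keeps the comparisons homogeneous without changing any
 * iteration count.
 */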
++th) { threads[th] = std::thread(helper, th); } helper(k - 1); - for (uint32_t th = 0; th < k - 1; ++th) { + for (int32_t th = 0; th < k - 1; ++th) { threads[th].join(); } From f7b92db7f6eb54cbad75c5a8d749b2b8a37352c5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Dec 2023 17:42:36 +0200 Subject: [PATCH 6/6] ggml : cuda jetson + arm quants warnings ggml-ci --- src/ggml-cuda.cu | 9 ++++++++- src/ggml-quants.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index a56a8fda3..7c2a834e3 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -90,6 +90,13 @@ #include <cuda_runtime.h> #include <cublas_v2.h> #include <cuda_fp16.h> +// CUDA 10.2 does not have these macro definitions. +#ifndef CUBLAS_TF32_TENSOR_OP_MATH +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif #endif // defined(GGML_USE_HIPBLAS) #include "ggml-cuda.h" @@ -8843,7 +8850,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; diff --git a/src/ggml-quants.c b/src/ggml-quants.c index 0e8163a16..a15a24048 100644 --- a/src/ggml-quants.c +++ b/src/ggml-quants.c @@ -3677,7 +3677,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); - const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; + const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), @@ -6626,7 +6626,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); - const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; + const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
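/*
 * (closing note on the {{ ... }} initializers above) ggml_int16x8x2_t
 * mirrors the NEON int16x8x2_t, a struct wrapping an array:
 *
 *     typedef struct { int16x8_t val[2]; } ggml_int16x8x2_t;   // shape only
 *
 * so portable brace initialization needs one brace set for the struct and
 * one for the array, {{ lo, hi }}. The single-brace form relied on brace
 * elision and produced -Wmissing-braces warnings on arm builds; the emitted
 * code is identical.
 */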