From 65648b341f786ca724beb6e288d748794a21d8ea Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 16:48:16 +0200 Subject: [PATCH 1/9] backend : add eval callback ggml-ci --- examples/simple/simple.cpp | 36 ++++++++++++++++++++++++++++++++++-- ggml-backend.c | 38 ++++++++++++++++++++++++++++++++++++-- ggml-backend.h | 7 +++++++ llama.cpp | 9 +++++++++ llama.h | 4 ++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9cfde8308f18f..b3ae68492ecfd 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,11 +6,36 @@ #include #include +// a function that can be called for every computed node during graph evaluation +// the user can choose to whether to observe the data of the node depending on the tensor parameters +static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_data) { + GGML_UNUSED(user_data); + + // check if name contains soft_max + if (strstr(t->name, "soft_max") != 0) { + printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + + std::vector t_data(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + + // print first row + for (int i = 0; i < t->ne[0]; i++) { + printf("%8.4f ", t_data[i]); + } + printf("\n"); + } + + return true; +} + int main(int argc, char ** argv) { gpt_params params; + bool observe = false; + if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [OBSERV]\n" , argv[0]); return 1 ; } @@ -22,6 +47,10 @@ int main(int argc, char ** argv) { params.prompt = argv[2]; } + if (argc >= 4) { + observe = atoi(argv[3]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -37,7 +66,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = 99; // offload all layers to the GPU llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -55,6 +84,9 @@ int main(int argc, char ** argv) { ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.cb_eval = observe ? 
observe_compute : NULL; + ctx_params.cb_eval_user_data = NULL; + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { diff --git a/ggml-backend.c b/ggml-backend.c index 505dbba476253..ee78f45fa92bd 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -802,6 +802,9 @@ struct ggml_backend_sched { __attribute__((aligned(GGML_MEM_ALIGN))) #endif char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; + + ggml_backend_sched_eval_callback callback_eval; + void * callback_eval_user_data; }; #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) @@ -1324,9 +1327,30 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { ggml_graph_dump_dot(split->graph, NULL, split_filename); #endif + uint64_t compute_start_us = ggml_time_us(); - ggml_backend_graph_compute(split_backend, &split->graph); - //ggml_backend_synchronize(split_backend); // necessary to measure compute time + if (!sched->callback_eval) { + ggml_backend_graph_compute(split_backend, &split->graph); + //ggml_backend_synchronize(split_backend); // necessary to measure compute time + } else { + // similar to ggml_backend_compare_graph_backend + for (int j = 0; j < split->graph.n_nodes; j++) { + struct ggml_tensor * t = split->graph.nodes[j]; + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, j + 1); + + ggml_backend_graph_compute(split_backend, &gv); + + if (ggml_is_view_op(t->op)) { + continue; + } + + // TODO: j is node index in the split, not in the original graph + if (!sched->callback_eval(j, t, sched->callback_eval_user_data)) { + break; + } + } + } uint64_t compute_end_us = ggml_time_us(); compute_us[split_backend_id] += compute_end_us - compute_start_us; } @@ -1352,6 +1376,10 @@ static void sched_reset(ggml_backend_sched_t sched) { memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // TODO: should we clear the callbacks? 
+ //sched->callback_eval = NULL; + //sched->callback_eval_user_data = NULL; + sched->is_reset = true; } @@ -1431,6 +1459,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched_reset(sched); } + +void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { + sched->callback_eval = callback; + sched->callback_eval_user_data = user_data; +} + int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } diff --git a/ggml-backend.h b/ggml-backend.h index 4eb244af1d3e7..057ed120189c4 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -148,6 +148,9 @@ extern "C" { struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // TODO: propose to rename to ggml_backend_sched_callback_eval + typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, void * user_data); + // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); @@ -168,6 +171,9 @@ extern "C" { // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); + // Set a callback to be called for each resulting node during graph compute + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + // // Utils // @@ -183,6 +189,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + // TODO: propose to rename this to ggml_backend_callback_compare typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends diff --git a/llama.cpp b/llama.cpp index 46c4d11c88873..5c1b211702f37 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1393,6 +1393,9 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; + + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; struct llama_layer { @@ -6254,6 +6257,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch); @@ -9267,6 +9271,8 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, + /*.cb_eval =*/ nullptr, + /*.cb_eval_user_data =*/ nullptr, }; return result; @@ -9401,6 +9407,9 @@ struct llama_context * llama_new_context_with_model( hparams.n_yarn_orig_ctx != 0 ? 
hparams.n_yarn_orig_ctx : hparams.n_ctx_train; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; diff --git a/llama.h b/llama.h index a570b0d6968fb..9f7a51a0f3aeb 100644 --- a/llama.h +++ b/llama.h @@ -2,6 +2,7 @@ #define LLAMA_H #include "ggml.h" +#include "ggml-backend.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES @@ -239,6 +240,9 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; // model quantization parameters From 01b6f68a003e4de97098001ae9650ee1c3645b13 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 17:30:22 +0200 Subject: [PATCH 2/9] backend : group nodes in a single compute when user don't need them --- examples/simple/simple.cpp | 28 ++++++++++++++++------------ ggml-backend.c | 21 ++++++++++++++------- ggml-backend.h | 8 +++++++- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index b3ae68492ecfd..dac7aa60afb1d 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -8,23 +8,27 @@ // a function that can be called for every computed node during graph evaluation // the user can choose to whether to observe the data of the node depending on the tensor parameters -static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_data) { +static bool observe_compute(int node_index, struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); - // check if name contains soft_max - if (strstr(t->name, "soft_max") != 0) { - printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", - __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // the scheduler is asking us if we want to observe this node + if (ask) { + // check if name contains soft_max + return strstr(t->name, "soft_max") != 0; + } - std::vector t_data(ggml_nelements(t)); - ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + // print the node data + printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - // print first row - for (int i = 0; i < t->ne[0]; i++) { - printf("%8.4f ", t_data[i]); - } - printf("\n"); + std::vector t_data(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + + // print first row + for (int i = 0; i < t->ne[0]; i++) { + printf("%8.4f ", t_data[i]); } + printf("\n"); return true; } diff --git a/ggml-backend.c b/ggml-backend.c index ee78f45fa92bd..0ec46ed32fe81 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1337,18 +1337,25 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { for (int j = 0; j < split->graph.n_nodes; j++) { struct ggml_tensor * t = split->graph.nodes[j]; - struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, j + 1); + int k = j; - ggml_backend_graph_compute(split_backend, &gv); - - if 
(ggml_is_view_op(t->op)) { - continue; + // check if the user needs data from this node + while (!sched->callback_eval(k, t, true, sched->callback_eval_user_data) && k < split->graph.n_nodes - 1) { + t = split->graph.nodes[++k]; } - // TODO: j is node index in the split, not in the original graph - if (!sched->callback_eval(j, t, sched->callback_eval_user_data)) { + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, k + 1); + + ggml_backend_graph_compute(split_backend, &gv); + + // TODO: k is node index in the split, not in the original graph + // TODO: avoid the ask == true call here + if (sched->callback_eval(k, t, true, sched->callback_eval_user_data) && + !sched->callback_eval(k, t, false, sched->callback_eval_user_data)) { break; } + + j = k; } } uint64_t compute_end_us = ggml_time_us(); diff --git a/ggml-backend.h b/ggml-backend.h index 057ed120189c4..0d4ff69ba17a2 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -148,8 +148,14 @@ extern "C" { struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // when ask == true, the scheduler wants to know if the user wants to observe this node + // this allows the scheduler to batch nodes together in order to evaluate them in a single call + // + // when ask == false, the scheduler is passing the node tensor to the user for observation + // if the user returns false, the scheduler will cancel the graph compute + // // TODO: propose to rename to ggml_backend_sched_callback_eval - typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, void * user_data); + typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); From 83f3d7a83c6eb9691db3f55477cccb3c9fd1cbab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 15:52:41 +0200 Subject: [PATCH 3/9] backend : clean-up the implementation ggml-ci --- examples/simple/simple.cpp | 11 ++++++----- ggml-backend.c | 27 +++++++++++++++------------ ggml-backend.h | 4 +--- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index dac7aa60afb1d..ce3497345d0d7 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -8,19 +8,20 @@ // a function that can be called for every computed node during graph evaluation // the user can choose to whether to observe the data of the node depending on the tensor parameters -static bool observe_compute(int node_index, struct ggml_tensor * t, bool ask, void * user_data) { +static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); // the scheduler is asking us if we want to observe this node if (ask) { - // check if name contains soft_max + // check if name contains soft_max (customize to your needs) return strstr(t->name, "soft_max") != 0; } - // print the node data - printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", - __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // print the node info + printf("%s: t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // this will copy the data to host memory (if 
needed) std::vector t_data(ggml_nelements(t)); ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); diff --git a/ggml-backend.c b/ggml-backend.c index 0ec46ed32fe81..07482bedf2ace 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1334,28 +1334,31 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend - for (int j = 0; j < split->graph.n_nodes; j++) { - struct ggml_tensor * t = split->graph.nodes[j]; + for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { + struct ggml_tensor * t = split->graph.nodes[j0]; - int k = j; + int j1 = j0; - // check if the user needs data from this node - while (!sched->callback_eval(k, t, true, sched->callback_eval_user_data) && k < split->graph.n_nodes - 1) { - t = split->graph.nodes[++k]; + // determine the range [j0, j1] of nodes that can be computed together + while (j1 < split->graph.n_nodes - 1) { + // check if the user needs data from this node + if (sched->callback_eval(t, true, sched->callback_eval_user_data)) { + break; + } + + t = split->graph.nodes[++j1]; } - struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, k + 1); + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); ggml_backend_graph_compute(split_backend, &gv); - // TODO: k is node index in the split, not in the original graph - // TODO: avoid the ask == true call here - if (sched->callback_eval(k, t, true, sched->callback_eval_user_data) && - !sched->callback_eval(k, t, false, sched->callback_eval_user_data)) { + if (sched->callback_eval(t, true, sched->callback_eval_user_data) && // ask + !sched->callback_eval(t, false, sched->callback_eval_user_data)) { // eval break; } - j = k; + j0 = j1; } } uint64_t compute_end_us = ggml_time_us(); diff --git a/ggml-backend.h b/ggml-backend.h index 0d4ff69ba17a2..5cef4d8b475ea 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -154,8 +154,7 @@ extern "C" { // when ask == false, the scheduler is passing the node tensor to the user for observation // if the user returns false, the scheduler will cancel the graph compute // - // TODO: propose to rename to ggml_backend_sched_callback_eval - typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, bool ask, void * user_data); + typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); @@ -195,7 +194,6 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - // TODO: propose to rename this to ggml_backend_callback_compare typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends From e1b1db9f09fe73ca8460890b0c93c349685c54d7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:42:16 +0200 Subject: [PATCH 4/9] simple : do not perform tensor data copy if not needed --- examples/simple/simple.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index ce3497345d0d7..8db37eef76901 100644 --- 
a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -22,12 +22,20 @@ static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) __func__, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); // this will copy the data to host memory (if needed) - std::vector t_data(ggml_nelements(t)); - ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + static std::vector t_data; + + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host || ggml_is_contiguous(t)) { + t_data.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + } + + const float * data = is_host ? (const float *) t->data : t_data.data(); // print first row for (int i = 0; i < t->ne[0]; i++) { - printf("%8.4f ", t_data[i]); + printf("%8.4f ", data[i]); } printf("\n"); From e0493800cecc2e18964f9c6d9db9db249bfca9c8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:43:46 +0200 Subject: [PATCH 5/9] simple : fix --- examples/simple/simple.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 8db37eef76901..b83e7a812d1ca 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -26,7 +26,7 @@ static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) const bool is_host = ggml_backend_buffer_is_host(t->buffer); - if (!is_host || ggml_is_contiguous(t)) { + if (!is_host || !ggml_is_contiguous(t)) { t_data.resize(ggml_nelements(t)); ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); } From 0b2fca9a9f11b8bf12ec640fee7f365f525318ec Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:18:11 +0200 Subject: [PATCH 6/9] imatrix : offload to GPU support --- examples/imatrix/imatrix.cpp | 61 ++++++++++++++++++++++++++++-------- ggml.c | 14 --------- ggml.h | 6 ---- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1461bc96376a7..e0cb718b24c29 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -33,19 +33,43 @@ class IMatrixCollector { public: IMatrixCollector() = default; void set_parameters(StatParams&& params) { m_params = std::move(params); } - void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix() const; private: std::unordered_map m_stats; StatParams m_params; std::mutex m_mutex; int m_last_call = 0; + std::vector m_src1_data; }; -void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(strncmp(src0->name, "blk.", 
4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + return true; + } + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host || !ggml_is_contiguous(src1)) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + auto& e = m_stats[src0->name]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); @@ -59,7 +83,7 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = (const float *)src1->data + row * src1->ne[0]; + const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; } @@ -70,6 +94,8 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st save_imatrix(); } } + + return true; } void IMatrixCollector::save_imatrix() const { @@ -93,8 +119,8 @@ void IMatrixCollector::save_imatrix() const { static IMatrixCollector g_collector; -static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - g_collector.collect_imatrix(src0, src1); +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); } @@ -320,8 +346,6 @@ int main(int argc, char ** argv) { g_collector.set_parameters(std::move(sparams)); - ggml_set_imatrix_collection(ik_collect_imatrix); - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -340,16 +364,27 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_model_params mparams = llama_model_params_from_gpt_params(params); - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", diff --git a/ggml.c b/ggml.c index ef5888ab21538..53197d4775417 100644 --- a/ggml.c +++ b/ggml.c @@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); -ggml_collect_imatrix_t g_imatrix_collect = NULL; - -void ggml_set_imatrix_collection(ggml_collect_imatrix_t 
imatrix_collect) { - g_imatrix_collect = imatrix_collect; -} - static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0, src1); - } - const enum ggml_type type = src0->type; const bool src1_cont = ggml_is_contiguous(src1); @@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0_cur, src1); - } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); diff --git a/ggml.h b/ggml.h index 1187074f7f174..4a2746729c163 100644 --- a/ggml.h +++ b/ggml.h @@ -2075,12 +2075,6 @@ extern "C" { GGML_API void ggml_init_iq2_quantization(enum ggml_type type); GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type); - // - // Importance matrix - // - typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1); - GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect); - // // gguf // From a722d05a873dbec5d9c7a29a97c09c255aef1355 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 14:43:35 +0200 Subject: [PATCH 7/9] imatrix : fix ggml_mul_mat_id hanlding ggml-ci --- examples/imatrix/imatrix.cpp | 95 +++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e0cb718b24c29..af78711c5ab66 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -41,6 +41,7 @@ class IMatrixCollector { std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id }; bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -52,6 +53,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; @@ -63,35 +65,84 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(src1->buffer); - if (!is_host || !ggml_is_contiguous(src1)) { + if (!is_host) { m_src1_data.resize(ggml_nelements(src1)); ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); } const float * data = is_host ? 
(const float *) src1->data : m_src1_data.data(); - auto& e = m_stats[src0->name]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); - } - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; + if (t->op == GGML_OP_MUL_MAT_ID) { + const int idx = ((int32_t *) t->op_params)[0]; + const int n_as = ((int32_t *) t->op_params)[1]; + + // the top-k selected expert ids are stored in the src0 tensor + // for simplicity, always copy src0 to host, because it is small + // take into account that src0 is not contiguous! + GGML_ASSERT(src0->ne[1] == src1->ne[1]); + GGML_ASSERT(n_as*ggml_nrows(src0)); + m_ids.resize(ggml_nbytes(src0)/sizeof(int)); + ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); + + // loop over all possible experts, regardless if they are used or not in the batch + // this is necessary to guarantee equal number of "ncall" for each tensor + for (int ex = 0; ex < n_as; ++ex) { + src0 = t->src[2 + ex]; + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger + // using the following line, we can correct for that if needed + //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const int excur = m_ids[row*n_as + idx]; + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + if (excur != ex) continue; + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + } } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { - save_imatrix(); + } else { + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % 
m_params.n_output_frequency == 0) { + save_imatrix(); + } } } From 10b25e0388de04aeaa141bd70648ddaa20904a70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 15:10:38 +0200 Subject: [PATCH 8/9] ci : add imatrix test ggml-ci --- ci/run.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 47a254f4cf1e8..08619f2693990 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -214,6 +214,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -241,6 +243,8 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -282,7 +286,6 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -304,6 +307,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" @@ -391,6 +395,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -418,6 +424,8 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -481,6 +489,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat 
$OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" From 4fb52843bb149ff178c5ef4b268fe1f1cd3f49aa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 15:27:34 +0200 Subject: [PATCH 9/9] ci : rearrange output ggml-ci --- ci/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 08619f2693990..1d678d45aacfe 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -295,6 +295,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -307,7 +308,6 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" - gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" @@ -477,6 +477,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -489,7 +490,6 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" - gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"