From 65648b341f786ca724beb6e288d748794a21d8ea Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 16:48:16 +0200 Subject: [PATCH 1/9] backend : add eval callback ggml-ci --- examples/simple/simple.cpp | 36 ++++++++++++++++++++++++++++++++++-- ggml-backend.c | 38 ++++++++++++++++++++++++++++++++++++-- ggml-backend.h | 7 +++++++ llama.cpp | 9 +++++++++ llama.h | 4 ++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9cfde8308f18f..b3ae68492ecfd 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,11 +6,36 @@ #include #include +// a function that can be called for every computed node during graph evaluation +// the user can choose to whether to observe the data of the node depending on the tensor parameters +static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_data) { + GGML_UNUSED(user_data); + + // check if name contains soft_max + if (strstr(t->name, "soft_max") != 0) { + printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + + std::vector t_data(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + + // print first row + for (int i = 0; i < t->ne[0]; i++) { + printf("%8.4f ", t_data[i]); + } + printf("\n"); + } + + return true; +} + int main(int argc, char ** argv) { gpt_params params; + bool observe = false; + if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [OBSERV]\n" , argv[0]); return 1 ; } @@ -22,6 +47,10 @@ int main(int argc, char ** argv) { params.prompt = argv[2]; } + if (argc >= 4) { + observe = atoi(argv[3]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -37,7 +66,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = 99; // offload all layers to the GPU llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -55,6 +84,9 @@ int main(int argc, char ** argv) { ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.cb_eval = observe ? 
observe_compute : NULL; + ctx_params.cb_eval_user_data = NULL; + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { diff --git a/ggml-backend.c b/ggml-backend.c index 505dbba476253..ee78f45fa92bd 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -802,6 +802,9 @@ struct ggml_backend_sched { __attribute__((aligned(GGML_MEM_ALIGN))) #endif char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; + + ggml_backend_sched_eval_callback callback_eval; + void * callback_eval_user_data; }; #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) @@ -1324,9 +1327,30 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { ggml_graph_dump_dot(split->graph, NULL, split_filename); #endif + uint64_t compute_start_us = ggml_time_us(); - ggml_backend_graph_compute(split_backend, &split->graph); - //ggml_backend_synchronize(split_backend); // necessary to measure compute time + if (!sched->callback_eval) { + ggml_backend_graph_compute(split_backend, &split->graph); + //ggml_backend_synchronize(split_backend); // necessary to measure compute time + } else { + // similar to ggml_backend_compare_graph_backend + for (int j = 0; j < split->graph.n_nodes; j++) { + struct ggml_tensor * t = split->graph.nodes[j]; + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, j + 1); + + ggml_backend_graph_compute(split_backend, &gv); + + if (ggml_is_view_op(t->op)) { + continue; + } + + // TODO: j is node index in the split, not in the original graph + if (!sched->callback_eval(j, t, sched->callback_eval_user_data)) { + break; + } + } + } uint64_t compute_end_us = ggml_time_us(); compute_us[split_backend_id] += compute_end_us - compute_start_us; } @@ -1352,6 +1376,10 @@ static void sched_reset(ggml_backend_sched_t sched) { memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // TODO: should we clear the callbacks? 
+ //sched->callback_eval = NULL; + //sched->callback_eval_user_data = NULL; + sched->is_reset = true; } @@ -1431,6 +1459,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched_reset(sched); } + +void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { + sched->callback_eval = callback; + sched->callback_eval_user_data = user_data; +} + int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } diff --git a/ggml-backend.h b/ggml-backend.h index 4eb244af1d3e7..057ed120189c4 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -148,6 +148,9 @@ extern "C" { struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // TODO: propose to rename to ggml_backend_sched_callback_eval + typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, void * user_data); + // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); @@ -168,6 +171,9 @@ extern "C" { // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); + // Set a callback to be called for each resulting node during graph compute + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + // // Utils // @@ -183,6 +189,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + // TODO: propose to rename this to ggml_backend_callback_compare typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends diff --git a/llama.cpp b/llama.cpp index 46c4d11c88873..5c1b211702f37 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1393,6 +1393,9 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; + + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; struct llama_layer { @@ -6254,6 +6257,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch); @@ -9267,6 +9271,8 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, + /*.cb_eval =*/ nullptr, + /*.cb_eval_user_data =*/ nullptr, }; return result; @@ -9401,6 +9407,9 @@ struct llama_context * llama_new_context_with_model( hparams.n_yarn_orig_ctx != 0 ? 
hparams.n_yarn_orig_ctx : hparams.n_ctx_train; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; diff --git a/llama.h b/llama.h index a570b0d6968fb..9f7a51a0f3aeb 100644 --- a/llama.h +++ b/llama.h @@ -2,6 +2,7 @@ #define LLAMA_H #include "ggml.h" +#include "ggml-backend.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES @@ -239,6 +240,9 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; // model quantization parameters From 01b6f68a003e4de97098001ae9650ee1c3645b13 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 14 Jan 2024 17:30:22 +0200 Subject: [PATCH 2/9] backend : group nodes in a single compute when user don't need them --- examples/simple/simple.cpp | 28 ++++++++++++++++------------ ggml-backend.c | 21 ++++++++++++++------- ggml-backend.h | 8 +++++++- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index b3ae68492ecfd..dac7aa60afb1d 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -8,23 +8,27 @@ // a function that can be called for every computed node during graph evaluation // the user can choose to whether to observe the data of the node depending on the tensor parameters -static bool observe_compute(int node_index, struct ggml_tensor * t, void * user_data) { +static bool observe_compute(int node_index, struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); - // check if name contains soft_max - if (strstr(t->name, "soft_max") != 0) { - printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", - __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // the scheduler is asking us if we want to observe this node + if (ask) { + // check if name contains soft_max + return strstr(t->name, "soft_max") != 0; + } - std::vector t_data(ggml_nelements(t)); - ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + // print the node data + printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - // print first row - for (int i = 0; i < t->ne[0]; i++) { - printf("%8.4f ", t_data[i]); - } - printf("\n"); + std::vector t_data(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + + // print first row + for (int i = 0; i < t->ne[0]; i++) { + printf("%8.4f ", t_data[i]); } + printf("\n"); return true; } diff --git a/ggml-backend.c b/ggml-backend.c index ee78f45fa92bd..0ec46ed32fe81 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1337,18 +1337,25 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { for (int j = 0; j < split->graph.n_nodes; j++) { struct ggml_tensor * t = split->graph.nodes[j]; - struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, j + 1); + int k = j; - ggml_backend_graph_compute(split_backend, &gv); - - if 
(ggml_is_view_op(t->op)) { - continue; + // check if the user needs data from this node + while (!sched->callback_eval(k, t, true, sched->callback_eval_user_data) && k < split->graph.n_nodes - 1) { + t = split->graph.nodes[++k]; } - // TODO: j is node index in the split, not in the original graph - if (!sched->callback_eval(j, t, sched->callback_eval_user_data)) { + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, k + 1); + + ggml_backend_graph_compute(split_backend, &gv); + + // TODO: k is node index in the split, not in the original graph + // TODO: avoid the ask == true call here + if (sched->callback_eval(k, t, true, sched->callback_eval_user_data) && + !sched->callback_eval(k, t, false, sched->callback_eval_user_data)) { break; } + + j = k; } } uint64_t compute_end_us = ggml_time_us(); diff --git a/ggml-backend.h b/ggml-backend.h index 057ed120189c4..0d4ff69ba17a2 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -148,8 +148,14 @@ extern "C" { struct ggml_backend_sched; typedef struct ggml_backend_sched * ggml_backend_sched_t; + // when ask == true, the scheduler wants to know if the user wants to observe this node + // this allows the scheduler to batch nodes together in order to evaluate them in a single call + // + // when ask == false, the scheduler is passing the node tensor to the user for observation + // if the user returns false, the scheduler will cancel the graph compute + // // TODO: propose to rename to ggml_backend_sched_callback_eval - typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, void * user_data); + typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); From 83f3d7a83c6eb9691db3f55477cccb3c9fd1cbab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 15:52:41 +0200 Subject: [PATCH 3/9] backend : clean-up the implementation ggml-ci --- examples/simple/simple.cpp | 11 ++++++----- ggml-backend.c | 27 +++++++++++++++------------ ggml-backend.h | 4 +--- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index dac7aa60afb1d..ce3497345d0d7 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -8,19 +8,20 @@ // a function that can be called for every computed node during graph evaluation // the user can choose to whether to observe the data of the node depending on the tensor parameters -static bool observe_compute(int node_index, struct ggml_tensor * t, bool ask, void * user_data) { +static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); // the scheduler is asking us if we want to observe this node if (ask) { - // check if name contains soft_max + // check if name contains soft_max (customize to your needs) return strstr(t->name, "soft_max") != 0; } - // print the node data - printf("%s: node_index = %5d, t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", - __func__, node_index, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // print the node info + printf("%s: t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", + __func__, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + // this will copy the data to host memory (if 
needed) std::vector t_data(ggml_nelements(t)); ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); diff --git a/ggml-backend.c b/ggml-backend.c index 0ec46ed32fe81..07482bedf2ace 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1334,28 +1334,31 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend - for (int j = 0; j < split->graph.n_nodes; j++) { - struct ggml_tensor * t = split->graph.nodes[j]; + for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { + struct ggml_tensor * t = split->graph.nodes[j0]; - int k = j; + int j1 = j0; - // check if the user needs data from this node - while (!sched->callback_eval(k, t, true, sched->callback_eval_user_data) && k < split->graph.n_nodes - 1) { - t = split->graph.nodes[++k]; + // determine the range [j0, j1] of nodes that can be computed together + while (j1 < split->graph.n_nodes - 1) { + // check if the user needs data from this node + if (sched->callback_eval(t, true, sched->callback_eval_user_data)) { + break; + } + + t = split->graph.nodes[++j1]; } - struct ggml_cgraph gv = ggml_graph_view(&split->graph, j, k + 1); + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); ggml_backend_graph_compute(split_backend, &gv); - // TODO: k is node index in the split, not in the original graph - // TODO: avoid the ask == true call here - if (sched->callback_eval(k, t, true, sched->callback_eval_user_data) && - !sched->callback_eval(k, t, false, sched->callback_eval_user_data)) { + if (sched->callback_eval(t, true, sched->callback_eval_user_data) && // ask + !sched->callback_eval(t, false, sched->callback_eval_user_data)) { // eval break; } - j = k; + j0 = j1; } } uint64_t compute_end_us = ggml_time_us(); diff --git a/ggml-backend.h b/ggml-backend.h index 0d4ff69ba17a2..5cef4d8b475ea 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -154,8 +154,7 @@ extern "C" { // when ask == false, the scheduler is passing the node tensor to the user for observation // if the user returns false, the scheduler will cancel the graph compute // - // TODO: propose to rename to ggml_backend_sched_callback_eval - typedef bool (*ggml_backend_sched_eval_callback)(int node_index, struct ggml_tensor * t, bool ask, void * user_data); + typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); @@ -195,7 +194,6 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - // TODO: propose to rename this to ggml_backend_callback_compare typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends From e1b1db9f09fe73ca8460890b0c93c349685c54d7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:42:16 +0200 Subject: [PATCH 4/9] simple : do not perform tensor data copy if not needed --- examples/simple/simple.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index ce3497345d0d7..8db37eef76901 100644 --- 
a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -22,12 +22,20 @@ static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) __func__, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); // this will copy the data to host memory (if needed) - std::vector t_data(ggml_nelements(t)); - ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + static std::vector t_data; + + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host || ggml_is_contiguous(t)) { + t_data.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); + } + + const float * data = is_host ? (const float *) t->data : t_data.data(); // print first row for (int i = 0; i < t->ne[0]; i++) { - printf("%8.4f ", t_data[i]); + printf("%8.4f ", data[i]); } printf("\n"); From e0493800cecc2e18964f9c6d9db9db249bfca9c8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:43:46 +0200 Subject: [PATCH 5/9] simple : fix --- examples/simple/simple.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 8db37eef76901..b83e7a812d1ca 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -26,7 +26,7 @@ static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) const bool is_host = ggml_backend_buffer_is_host(t->buffer); - if (!is_host || ggml_is_contiguous(t)) { + if (!is_host || !ggml_is_contiguous(t)) { t_data.resize(ggml_nelements(t)); ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); } From 0b2fca9a9f11b8bf12ec640fee7f365f525318ec Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 16:18:11 +0200 Subject: [PATCH 6/9] imatrix : offload to GPU support --- examples/imatrix/imatrix.cpp | 61 ++++++++++++++++++++++++++++-------- ggml.c | 14 --------- ggml.h | 6 ---- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1461bc96376a7..e0cb718b24c29 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -33,19 +33,43 @@ class IMatrixCollector { public: IMatrixCollector() = default; void set_parameters(StatParams&& params) { m_params = std::move(params); } - void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix() const; private: std::unordered_map m_stats; StatParams m_params; std::mutex m_mutex; int m_last_call = 0; + std::vector m_src1_data; }; -void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(strncmp(src0->name, "blk.", 
4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + return true; + } + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host || !ggml_is_contiguous(src1)) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + auto& e = m_stats[src0->name]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); @@ -59,7 +83,7 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = (const float *)src1->data + row * src1->ne[0]; + const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; } @@ -70,6 +94,8 @@ void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const st save_imatrix(); } } + + return true; } void IMatrixCollector::save_imatrix() const { @@ -93,8 +119,8 @@ void IMatrixCollector::save_imatrix() const { static IMatrixCollector g_collector; -static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - g_collector.collect_imatrix(src0, src1); +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); } @@ -320,8 +346,6 @@ int main(int argc, char ** argv) { g_collector.set_parameters(std::move(sparams)); - ggml_set_imatrix_collection(ik_collect_imatrix); - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -340,16 +364,27 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_model_params mparams = llama_model_params_from_gpt_params(params); - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", diff --git a/ggml.c b/ggml.c index ef5888ab21538..53197d4775417 100644 --- a/ggml.c +++ b/ggml.c @@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); -ggml_collect_imatrix_t g_imatrix_collect = NULL; - -void ggml_set_imatrix_collection(ggml_collect_imatrix_t 
imatrix_collect) { - g_imatrix_collect = imatrix_collect; -} - static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0, src1); - } - const enum ggml_type type = src0->type; const bool src1_cont = ggml_is_contiguous(src1); @@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; - if (ith == 1 && g_imatrix_collect) { - g_imatrix_collect(src0_cur, src1); - } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); diff --git a/ggml.h b/ggml.h index 1187074f7f174..4a2746729c163 100644 --- a/ggml.h +++ b/ggml.h @@ -2075,12 +2075,6 @@ extern "C" { GGML_API void ggml_init_iq2_quantization(enum ggml_type type); GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type); - // - // Importance matrix - // - typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1); - GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect); - // // gguf // From a722d05a873dbec5d9c7a29a97c09c255aef1355 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 14:43:35 +0200 Subject: [PATCH 7/9] imatrix : fix ggml_mul_mat_id hanlding ggml-ci --- examples/imatrix/imatrix.cpp | 95 +++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e0cb718b24c29..af78711c5ab66 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -41,6 +41,7 @@ class IMatrixCollector { std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id }; bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -52,6 +53,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; @@ -63,35 +65,84 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(src1->buffer); - if (!is_host || !ggml_is_contiguous(src1)) { + if (!is_host) { m_src1_data.resize(ggml_nelements(src1)); ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); } const float * data = is_host ? 
(const float *) src1->data : m_src1_data.data(); - auto& e = m_stats[src0->name]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); - } - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; + if (t->op == GGML_OP_MUL_MAT_ID) { + const int idx = ((int32_t *) t->op_params)[0]; + const int n_as = ((int32_t *) t->op_params)[1]; + + // the top-k selected expert ids are stored in the src0 tensor + // for simplicity, always copy src0 to host, because it is small + // take into account that src0 is not contiguous! + GGML_ASSERT(src0->ne[1] == src1->ne[1]); + GGML_ASSERT(n_as*ggml_nrows(src0)); + m_ids.resize(ggml_nbytes(src0)/sizeof(int)); + ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); + + // loop over all possible experts, regardless if they are used or not in the batch + // this is necessary to guarantee equal number of "ncall" for each tensor + for (int ex = 0; ex < n_as; ++ex) { + src0 = t->src[2 + ex]; + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger + // using the following line, we can correct for that if needed + //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const int excur = m_ids[row*n_as + idx]; + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + if (excur != ex) continue; + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + } } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { - save_imatrix(); + } else { + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % 
m_params.n_output_frequency == 0) { + save_imatrix(); + } } } From 10b25e0388de04aeaa141bd70648ddaa20904a70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 15:10:38 +0200 Subject: [PATCH 8/9] ci : add imatrix test ggml-ci --- ci/run.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 47a254f4cf1e8..08619f2693990 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -214,6 +214,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -241,6 +243,8 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -282,7 +286,6 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -304,6 +307,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" @@ -391,6 +395,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -418,6 +424,8 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -481,6 +489,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat 
$OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" From 4fb52843bb149ff178c5ef4b268fe1f1cd3f49aa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jan 2024 15:27:34 +0200 Subject: [PATCH 9/9] ci : rearrange output ggml-ci --- ci/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 08619f2693990..1d678d45aacfe 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -295,6 +295,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -307,7 +308,6 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" - gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" @@ -477,6 +477,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -489,7 +490,6 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" - gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"