@@ -33,43 +33,120 @@ class IMatrixCollector {
 public:
     IMatrixCollector() = default;
     void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix() const;
 private:
     std::unordered_map<std::string, Stats> m_stats;
     StatParams m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
+    std::vector<float> m_src1_data;
+    std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
 };
 
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto & e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
-    }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); // GGML_ASSERT(false);
+bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    GGML_UNUSED(user_data);
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
+    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
+    if (ask) {
+        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
+        if (t->op != GGML_OP_MUL_MAT) return false;
+        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
+        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        return true;
     }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
+
+    if (!is_host) {
+        m_src1_data.resize(ggml_nelements(src1));
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
     }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
+
+    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+
+    if (t->op == GGML_OP_MUL_MAT_ID) {
+        const int idx  = ((int32_t *) t->op_params)[0];
+        const int n_as = ((int32_t *) t->op_params)[1];
+
+        // the top-k selected expert ids are stored in the src0 tensor
+        // for simplicity, always copy src0 to host, because it is small
+        // take into account that src0 is not contiguous!
+        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(src0));
+        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
+        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+
+        // loop over all possible experts, regardless if they are used or not in the batch
+        // this is necessary to guarantee equal number of "ncall" for each tensor
+        for (int ex = 0; ex < n_as; ++ex) {
+            src0 = t->src[2 + ex];
+            auto & e = m_stats[src0->name];
+            if (e.values.empty()) {
+                e.values.resize(src1->ne[0], 0);
+            }
+            else if (e.values.size() != (size_t)src1->ne[0]) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                exit(1); // GGML_ASSERT(false);
+            }
+            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+            //       using the following line, we can correct for that if needed
+            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+            ++e.ncall;
+            if (m_params.verbosity > 1) {
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            }
+            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                const int excur = m_ids[row*n_as + idx];
+                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+                if (excur != ex) continue;
+                const float * x = data + row * src1->ne[0];
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.values[j] += x[j]*x[j];
+                }
+            }
+            if (e.ncall > m_last_call) {
+                m_last_call = e.ncall;
+                if (m_last_call % m_params.n_output_frequency == 0) {
+                    save_imatrix();
+                }
+            }
         }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
+    } else {
+        auto & e = m_stats[src0->name];
+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0], 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            exit(1); // GGML_ASSERT(false);
+        }
+        ++e.ncall;
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
+        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+            const float * x = data + row * src1->ne[0];
+            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.values[j] += x[j]*x[j];
+            }
+        }
+        if (e.ncall > m_last_call) {
+            m_last_call = e.ncall;
+            if (m_last_call % m_params.n_output_frequency == 0) {
+                save_imatrix();
+            }
         }
     }
+
+    return true;
 }
 
 void IMatrixCollector::save_imatrix() const {
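For readers new to the eval-callback mechanism used above, here is a minimal sketch of the same two-phase (ask/collect) protocol, stripped of the imatrix bookkeeping. It is not part of the patch; my_eval_cb is a hypothetical name, the "accumulate" step is a placeholder, and ggml.h, ggml-backend.h and <vector> are assumed to be included:

// minimal sketch of the ask/collect protocol that collect_imatrix() above implements
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);
    if (ask) {
        // phase 1: the scheduler asks whether this node's data is wanted
        return t->op == GGML_OP_MUL_MAT && t->src[1]->type == GGML_TYPE_F32;
    }
    // phase 2: the node has been computed; its inputs can now be read
    const struct ggml_tensor * src1 = t->src[1];
    std::vector<float> host_buf;
    const float * data = (const float *) src1->data;
    if (!ggml_backend_buffer_is_host(src1->buffer)) {
        // the activations may live in device memory, so copy them to the host first
        host_buf.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, host_buf.data(), 0, ggml_nbytes(src1));
        data = host_buf.data();
    }
    // ... accumulate per-column statistics from data, as collect_imatrix() does ...
    (void) data;
    return true; // keep receiving callbacks for the rest of the graph
}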
@@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
 
 static IMatrixCollector g_collector;
 
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
+static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    return g_collector.collect_imatrix(t, ask, user_data);
 }
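The wrapper above now has the shape of the scheduler's eval callback rather than the old global imatrix hook, which is why it can be assigned to cparams.cb_eval later in this diff. For reference, the callback type is assumed to be declared along these lines (a sketch; the authoritative typedef lives in ggml-backend.h):

// assumed declaration of the eval callback type accepted by llama_context_params::cb_eval;
// it is invoked once with ask=true (filtering) and, if that returns true, again with ask=false (collection)
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);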
@@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
 
     g_collector.set_parameters(std::move(sparams));
 
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
     params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
@@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
 
     llama_backend_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
+    llama_model_params mparams = llama_model_params_from_gpt_params(params);
 
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
 
+    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    cparams.cb_eval = ik_collect_imatrix;
+    cparams.cb_eval_user_data = NULL;
+
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+        return 1;
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
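Note that the patch keeps the collector in a global (g_collector) and passes NULL as cb_eval_user_data. The user_data argument exists so a callback can carry its own state instead; a hypothetical alternative wiring (a sketch only, not what this patch does) would be:

// hypothetical: route the collector through user_data instead of a global
static bool ik_collect_imatrix_ud(struct ggml_tensor * t, bool ask, void * user_data) {
    return static_cast<IMatrixCollector *>(user_data)->collect_imatrix(t, ask, user_data);
}

// in main(), after llama_context_params_from_gpt_params(params):
//     cparams.cb_eval           = ik_collect_imatrix_ud;
//     cparams.cb_eval_user_data = &g_collector;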