Skip to content

Commit 35f1e59

Browse files
ggerganov and jordankanter
authored and committed
imatrix : offload to GPU support (ggml-org#4957)
* backend : add eval callback ggml-ci * backend : group nodes in a single compute when the user doesn't need them * backend : clean-up the implementation ggml-ci * simple : do not perform tensor data copy if not needed * simple : fix * imatrix : offload to GPU support * imatrix : fix ggml_mul_mat_id handling ggml-ci * ci : add imatrix test ggml-ci * ci : rearrange output ggml-ci
1 parent ae1aae1 commit 35f1e59

File tree

4 files changed

+128
-53
lines changed

4 files changed

+128
-53
lines changed

ci/run.sh

+10-1
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ function gg_run_open_llama_3b_v2 {
216216
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
217217
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
218218

219+
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
220+
219221
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
220222

221223
function check_ppl {
@@ -243,6 +245,8 @@ function gg_run_open_llama_3b_v2 {
243245
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
244246
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
245247

248+
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
249+
246250
# lora
247251
function compare_ppl {
248252
qnt="$1"
@@ -284,7 +288,6 @@ function gg_run_open_llama_3b_v2 {
284288
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
285289
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
286290

287-
288291
set +e
289292
}
290293

@@ -294,6 +297,7 @@ function gg_sum_open_llama_3b_v2 {
294297
gg_printf 'OpenLLaMA 3B-v2:\n'
295298
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
296299
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
300+
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
297301
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
298302
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
299303
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@@ -393,6 +397,8 @@ function gg_run_open_llama_7b_v2 {
393397
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
394398
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
395399

400+
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
401+
396402
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
397403

398404
function check_ppl {
@@ -420,6 +426,8 @@ function gg_run_open_llama_7b_v2 {
420426
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
421427
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
422428

429+
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
430+
423431
# lora
424432
function compare_ppl {
425433
qnt="$1"
@@ -471,6 +479,7 @@ function gg_sum_open_llama_7b_v2 {
471479
gg_printf 'OpenLLaMA 7B-v2:\n'
472480
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
473481
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
482+
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
474483
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
475484
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
476485
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"

examples/imatrix/imatrix.cpp

+118-32
Original file line numberDiff line numberDiff line change
@@ -33,43 +33,120 @@ class IMatrixCollector {
3333
public:
3434
IMatrixCollector() = default;
3535
void set_parameters(StatParams&& params) { m_params = std::move(params); }
36-
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
36+
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
3737
void save_imatrix() const;
3838
private:
3939
std::unordered_map<std::string, Stats> m_stats;
4040
StatParams m_params;
4141
std::mutex m_mutex;
4242
int m_last_call = 0;
43+
std::vector<float> m_src1_data;
44+
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
4345
};
4446

45-
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
46-
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
47-
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
48-
std::lock_guard<std::mutex> lock(m_mutex);
49-
auto& e = m_stats[src0->name];
50-
if (e.values.empty()) {
51-
e.values.resize(src1->ne[0], 0);
52-
}
53-
else if (e.values.size() != (size_t)src1->ne[0]) {
54-
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
55-
exit(1); //GGML_ASSERT(false);
47+
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
48+
GGML_UNUSED(user_data);
49+
50+
const struct ggml_tensor * src0 = t->src[0];
51+
const struct ggml_tensor * src1 = t->src[1];
52+
53+
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
54+
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
55+
if (ask) {
56+
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
57+
if (t->op != GGML_OP_MUL_MAT) return false;
58+
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
59+
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
60+
return true;
5661
}
57-
++e.ncall;
58-
if (m_params.verbosity > 1) {
59-
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
62+
63+
std::lock_guard<std::mutex> lock(m_mutex);
64+
65+
// copy the data from the GPU memory if needed
66+
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
67+
68+
if (!is_host) {
69+
m_src1_data.resize(ggml_nelements(src1));
70+
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
6071
}
61-
for (int row = 0; row < (int)src1->ne[1]; ++row) {
62-
const float * x = (const float *)src1->data + row * src1->ne[0];
63-
for (int j = 0; j < (int)src1->ne[0]; ++j) {
64-
e.values[j] += x[j]*x[j];
72+
73+
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
74+
75+
if (t->op == GGML_OP_MUL_MAT_ID) {
76+
const int idx = ((int32_t *) t->op_params)[0];
77+
const int n_as = ((int32_t *) t->op_params)[1];
78+
79+
// the top-k selected expert ids are stored in the src0 tensor
80+
// for simplicity, always copy src0 to host, because it is small
81+
// take into account that src0 is not contiguous!
82+
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
83+
GGML_ASSERT(n_as*ggml_nrows(src0));
84+
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
85+
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
86+
87+
// loop over all possible experts, regardless if they are used or not in the batch
88+
// this is necessary to guarantee equal number of "ncall" for each tensor
89+
for (int ex = 0; ex < n_as; ++ex) {
90+
src0 = t->src[2 + ex];
91+
auto& e = m_stats[src0->name];
92+
if (e.values.empty()) {
93+
e.values.resize(src1->ne[0], 0);
94+
}
95+
else if (e.values.size() != (size_t)src1->ne[0]) {
96+
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
97+
exit(1); //GGML_ASSERT(false);
98+
}
99+
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
100+
// using the following line, we can correct for that if needed
101+
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
102+
++e.ncall;
103+
if (m_params.verbosity > 1) {
104+
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
105+
}
106+
for (int row = 0; row < (int)src1->ne[1]; ++row) {
107+
const int excur = m_ids[row*n_as + idx];
108+
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
109+
if (excur != ex) continue;
110+
const float * x = data + row * src1->ne[0];
111+
for (int j = 0; j < (int)src1->ne[0]; ++j) {
112+
e.values[j] += x[j]*x[j];
113+
}
114+
}
115+
if (e.ncall > m_last_call) {
116+
m_last_call = e.ncall;
117+
if (m_last_call % m_params.n_output_frequency == 0) {
118+
save_imatrix();
119+
}
120+
}
65121
}
66-
}
67-
if (e.ncall > m_last_call) {
68-
m_last_call = e.ncall;
69-
if (m_last_call % m_params.n_output_frequency == 0) {
70-
save_imatrix();
122+
} else {
123+
auto& e = m_stats[src0->name];
124+
if (e.values.empty()) {
125+
e.values.resize(src1->ne[0], 0);
126+
}
127+
else if (e.values.size() != (size_t)src1->ne[0]) {
128+
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
129+
exit(1); //GGML_ASSERT(false);
130+
}
131+
++e.ncall;
132+
if (m_params.verbosity > 1) {
133+
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
134+
}
135+
for (int row = 0; row < (int)src1->ne[1]; ++row) {
136+
const float * x = data + row * src1->ne[0];
137+
for (int j = 0; j < (int)src1->ne[0]; ++j) {
138+
e.values[j] += x[j]*x[j];
139+
}
140+
}
141+
if (e.ncall > m_last_call) {
142+
m_last_call = e.ncall;
143+
if (m_last_call % m_params.n_output_frequency == 0) {
144+
save_imatrix();
145+
}
71146
}
72147
}
148+
149+
return true;
73150
}
74151

75152
void IMatrixCollector::save_imatrix() const {
@@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
93170

94171
static IMatrixCollector g_collector;
95172

96-
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
97-
g_collector.collect_imatrix(src0, src1);
173+
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
174+
return g_collector.collect_imatrix(t, ask, user_data);
98175
}
99176

100177

@@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
320397

321398
g_collector.set_parameters(std::move(sparams));
322399

323-
ggml_set_imatrix_collection(ik_collect_imatrix);
324-
325400
params.logits_all = true;
326401
params.n_batch = std::min(params.n_batch, params.n_ctx);
327402

@@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
340415

341416
llama_backend_init(params.numa);
342417

343-
llama_model * model;
344-
llama_context * ctx;
418+
llama_model_params mparams = llama_model_params_from_gpt_params(params);
345419

346-
// load the model and apply lora adapter, if any
347-
std::tie(model, ctx) = llama_init_from_gpt_params(params);
420+
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
348421
if (model == NULL) {
349422
fprintf(stderr, "%s: error: unable to load model\n", __func__);
350423
return 1;
351424
}
352425

426+
llama_context_params cparams = llama_context_params_from_gpt_params(params);
427+
428+
// pass the callback to the backend scheduler
429+
// it will be executed for each node during the graph computation
430+
cparams.cb_eval = ik_collect_imatrix;
431+
cparams.cb_eval_user_data = NULL;
432+
433+
llama_context * ctx = llama_new_context_with_model(model, cparams);
434+
if (ctx == NULL) {
435+
fprintf(stderr, "%s: error: unable to create context\n", __func__);
436+
return 1;
437+
}
438+
353439
const int n_ctx_train = llama_n_ctx_train(model);
354440
if (params.n_ctx > n_ctx_train) {
355441
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",

ggml.c

-14
Original file line numberDiff line numberDiff line change
@@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
394394
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
395395
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
396396

397-
ggml_collect_imatrix_t g_imatrix_collect = NULL;
398-
399-
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
400-
g_imatrix_collect = imatrix_collect;
401-
}
402-
403397
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
404398
[GGML_TYPE_I8] = {
405399
.type_name = "i8",
@@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
97909784
const int ith = params->ith;
97919785
const int nth = params->nth;
97929786

9793-
if (ith == 1 && g_imatrix_collect) {
9794-
g_imatrix_collect(src0, src1);
9795-
}
9796-
97979787
const enum ggml_type type = src0->type;
97989788

97999789
const bool src1_cont = ggml_is_contiguous(src1);
@@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
1009710087

1009810088
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
1009910089

10100-
if (ith == 1 && g_imatrix_collect) {
10101-
g_imatrix_collect(src0_cur, src1);
10102-
}
10103-
1010410090
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1010510091
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1010610092

ggml.h

-6
Original file line numberDiff line numberDiff line change
@@ -2085,12 +2085,6 @@ extern "C" {
20852085
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
20862086
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
20872087

2088-
//
2089-
// Importance matrix
2090-
//
2091-
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
2092-
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
2093-
20942088
//
20952089
// gguf
20962090
//

0 commit comments

Comments (0)