
Commit d39765b

ikawrakow (Kawrakow) authored and committed
2-bit quantizations (ggml-org#4897)
* imatrix: load
* imatrix: WIP
* imatrix: Add Q2_K quantization
* imatrix: also guard against Q2_K_S quantization without importance matrix
* imatrix: guard even more against low-bit quantization misuse

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 2282a3d commit d39765b

File tree: 9 files changed, +1149 -82 lines changed


Diff for: examples/benchmark/benchmark-matmult.cpp (+2 -2)
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
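
For context, the call-site change reflects the new ggml_quantize_chunk() signature introduced by this commit: instead of a flat element count it now takes a row count and a row size, plus a trailing pointer to optional per-column importance weights (nullptr when no importance matrix is available). A minimal sketch of a call that does pass importance data, assuming a hypothetical float buffer imatrix_weights with one weight per column of the source tensor; qtype, src and dst are placeholders, not names from this commit:

// Sketch only: quantize a 2-D f32 tensor "src" into pre-allocated "dst" storage,
// passing per-column importance weights.
const int n_per_row = (int) src->ne[0];                        // row size (number of columns)
const int nrows     = (int) (ggml_nelements(src) / src->ne[0]);
std::vector<int64_t> hist(1 << 4, 0);                          // histogram buffer, as in the benchmark
ggml_quantize_chunk(qtype, (const float *) src->data, dst->data,
                    0 /*start*/, nrows, n_per_row,
                    hist.data(), imatrix_weights /* or nullptr when no imatrix is used */);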

Diff for: examples/quantize/quantize.cpp (+131 -2)
@@ -5,6 +5,10 @@
 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf(" %2d or ", it.ftype);
         } else {
             printf(" ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
 
+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char*)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto& e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char*)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char*)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+}
+
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
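
The reader added above implies a small self-describing binary layout: an int32 entry count, then for each entry an int32 name length, the raw name bytes, an int32 call count (ncall), an int32 value count (nval), and nval raw floats that load_imatrix() divides by ncall. Purely as an illustration of that layout (this writer is not part of the commit, the helper name and data layout of the argument are hypothetical, and real imatrix files are produced by the separate imatrix tool):

#include <fstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Hypothetical helper that mirrors what load_imatrix() above expects to read.
// "data" maps a tensor name to {ncall, accumulated per-column importance sums}.
static void write_imatrix(const std::string & fname,
        const std::unordered_map<std::string, std::pair<int, std::vector<float>>> & data) {
    std::ofstream out(fname.c_str(), std::ios::binary);
    int n_entries = (int)data.size();
    out.write((const char *)&n_entries, sizeof(n_entries));                      // int32 entry count
    for (const auto & kv : data) {
        int len = (int)kv.first.size();
        out.write((const char *)&len, sizeof(len));                              // int32 name length
        out.write(kv.first.data(), len);                                         // raw name bytes
        int ncall = kv.second.first;
        out.write((const char *)&ncall, sizeof(ncall));                          // int32 number of accumulated calls
        int nval = (int)kv.second.second.size();
        out.write((const char *)&nval, sizeof(nval));                            // int32 number of values
        out.write((const char *)kv.second.second.data(), nval*sizeof(float));    // raw float data
    }
}
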
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();
 
     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }
 
     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
         usage(argv[0]);
     }
 
+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
     llama_backend_init(false);
 
     // parse command line arguments
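
Put together, the new flags load an importance matrix and optionally restrict which tensors it applies to by substring match on the tensor name (see prepare_imatrix() above). A couple of hedged example invocations; the file and model names are placeholders, not taken from this commit:

# use the importance matrix for every tensor it covers
./quantize --imatrix imatrix.dat ggml-model-f16.gguf model-iq2_xs.gguf IQ2_XS 8

# apply it only to tensors whose name contains "attn", or to everything except
# tensors whose name contains "ffn"; the two filters cannot be combined
./quantize --imatrix imatrix.dat --include-weights attn ggml-model-f16.gguf model-q2_k_s.gguf Q2_K_S
./quantize --imatrix imatrix.dat --exclude-weights ffn  ggml-model-f16.gguf model-q2_k_s.gguf Q2_K_S
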
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
