 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf("  %2d  or  ", it.ftype);
         } else {
             printf("          ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
 
+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char *)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto & e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char *)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char *)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n", __func__, i);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+}
+
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
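
The imatrix file consumed by load_imatrix() above is a flat binary layout, recoverable from the reads: an int entry count, then per entry an int name length, the raw name bytes (no terminator), an int ncall, an int nval, and nval floats, where the floats are sums that the loader divides by ncall. A minimal sketch of the matching writer side, under that assumption — write_imatrix() is a hypothetical helper, not part of this change:

// Sketch of a writer for the layout load_imatrix() reads (hypothetical helper).
// Values are written as sums accumulated over ncall evaluations, since the
// loader divides each entry by ncall on load.
static void write_imatrix(const std::string& fname,
        const std::unordered_map<std::string, std::vector<float>>& data, int ncall) {
    std::ofstream out(fname.c_str(), std::ios::binary);
    int n_entries = int(data.size());
    out.write((const char *)&n_entries, sizeof(n_entries));            // entry count
    for (const auto& kv : data) {
        int len = int(kv.first.size());
        out.write((const char *)&len, sizeof(len));                    // name length
        out.write(kv.first.data(), len);                               // name bytes, no NUL
        out.write((const char *)&ncall, sizeof(ncall));                // call count
        int nval = int(kv.second.size());
        out.write((const char *)&nval, sizeof(nval));                  // number of values
        out.write((const char *)kv.second.data(), nval*sizeof(float)); // sums over ncall calls
    }
}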
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();
 
     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }
 
     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
         usage(argv[0]);
     }
 
+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
     llama_backend_init(false);
 
     // parse command line arguments
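
Note that --include-weights and --exclude-weights match tensor names by substring (std::string::find in prepare_imatrix() above), so a single pattern such as ffn_down selects every entry whose name contains it. A typical invocation, assuming the binary is named quantize and with hypothetical file names, would be:

    ./quantize --imatrix imatrix.dat --include-weights ffn_down model-f32.gguf model-quant.gguf IQ2_XS 8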
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();
 
     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());