Commit 859fee6
quantize : use map to assign quantization type from string (#1191)

Assign the quantization type from a string name instead of an `int` (the `int` option is still supported). This allows the following usage: `./quantize ggml-model-f16.bin ggml-model-q4_0.bin q4_0` instead of: `./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2`
1 parent: 4afcc37
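The dispatch, as implemented in the quantize.cpp diff below: an argument whose first character is `q` is looked up in the new string-to-ftype map (with an error for unknown names), and anything else falls through to the old `atoi` path, which is what keeps the numeric form working.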

3 files changed: +27 −9 lines

.devops/tools.sh (+1 −1)
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 else
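A side note on the shell idiom in this hunk: `${i/f16/q4_0}` is Bash pattern substitution, which replaces the first occurrence of `f16` in `$i` with `q4_0`, so the quantized output filename is derived from the F16 input filename.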

README.md (+2 −2)
@@ -203,8 +203,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
 
-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128

examples/quantize/quantize.cpp (+24 −6)
@@ -2,8 +2,17 @@
 #include "llama.h"
 
 #include <cstdio>
+#include <map>
 #include <string>
 
+static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
+  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+  {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
+  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+};
+
 // usage:
 //  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
@@ -12,11 +21,9 @@ int main(int argc, char ** argv) {
 
     if (argc < 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
-        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
-        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
-        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
-        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
         return 1;
     }
 
@@ -30,7 +37,18 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    enum llama_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum llama_ftype)atoi(argv[3]);
+    }
+
     int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();
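To see the pattern in isolation, here is a minimal, self-contained sketch of the same string-or-int dispatch. It is not the project's code: the enum values are stand-ins (only q4_0 = 2 is confirmed by the old README's "method 2 = q4_0" comment), so consult llama.h for the real constants.

// Minimal, self-contained sketch of the parsing pattern above
// (not the actual tool; the enum values are stand-ins -- only
// q4_0 = 2 is confirmed by the old README's "method 2 = q4_0").
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

enum llama_ftype {
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // value confirmed by the old README comment
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // assumed for illustration
};

static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
};

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s type\n", argv[0]);
        return 1;
    }
    enum llama_ftype ftype;
    if (argv[1][0] == 'q') {
        // name form: look the string up, reject unknown names
        auto it = LLAMA_FTYPE_MAP.find(argv[1]);
        if (it == LLAMA_FTYPE_MAP.end()) {
            fprintf(stderr, "unknown ftype '%s'\n", argv[1]);
            return 1;
        }
        ftype = it->second;
    } else {
        // numeric form: the pre-existing int path is kept as-is
        ftype = (enum llama_ftype)atoi(argv[1]);
    }
    printf("ftype = %d\n", (int)ftype);
    return 0;
}

Invoked with either `q4_0` or `2` this prints `ftype = 2`; an unknown `q`-prefixed name is rejected, while any other non-numeric string becomes `0` via `atoi`, the one rough edge of the first-character heuristic.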
