Skip to content

Commit 36d19a6

Browse files
authored
Remove Q4_3 which is no better than Q5 (#1218)
1 parent 7f15c5c commit 36d19a6

11 files changed

+21
-359
lines changed

README.md

+18-19
Original file line numberDiff line numberDiff line change
@@ -281,30 +281,29 @@ When running the larger models, make sure you have enough disk space to store al
281281
282282
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
283283
284-
| model | original size | quantized size (4-bit) |
285-
|-------|---------------|------------------------|
286-
| 7B | 13 GB | 3.9 GB |
287-
| 13B | 24 GB | 7.8 GB |
288-
| 30B | 60 GB | 19.5 GB |
289-
| 65B | 120 GB | 38.5 GB |
284+
| Model | Original size | Quantized size (4-bit) |
285+
|------:|--------------:|-----------------------:|
286+
| 7B | 13 GB | 3.9 GB |
287+
| 13B | 24 GB | 7.8 GB |
288+
| 30B | 60 GB | 19.5 GB |
289+
| 65B | 120 GB | 38.5 GB |
290290
291291
### Quantization
292292
293293
Several quantization methods are supported. They differ in the resulting model disk size, inference speed, and output quality (measured as perplexity; lower is better).
294294
295-
Model | F16 | Q4_0 | Q4_1 | Q4_2 | Q4_3 | Q5_0 | Q5_1 | Q8_0
296-
-- | -- | -- | -- | -- | -- | -- | -- | --
297-
7B (ppl) | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0617 | 6.0139 | 5.9934 | 5.9571
298-
7B (size) | 13.0G | 4.0G | 4.8G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G
299-
7B (ms/tok @ 4th) | 128 | 56 | 61 | 84 | 91 | 91 | 95 | 75
300-
7B (ms/tok @ 8th) | 128 | 47 | 55 | 48 | 53 | 53 | 59 | 75
301-
7B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
302-
-- | -- | -- | -- | -- | -- | -- | -- | --
303-
13B (ppl) | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.3234 | 5.2768 | 5.2582 | 5.2458
304-
13B (size) | 25.0G | 7.6G | 9.1G | 7.6G | 9.1G | 8.4G | 9.1G | 14G
305-
13B (ms/tok @ 4th) | 239 | 104 | 113 | 160 | 175 | 176 | 185 | 141
306-
13B (ms/tok @ 8th) | 240 | 85 | 99 | 97 | 114 | 108 | 117 | 147
307-
13B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
295+
| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 |
296+
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
297+
| 7B | perplexity | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 |
298+
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G |
299+
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 |
300+
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 |
301+
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
302+
| 13B | perplexity | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 |
303+
| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G |
304+
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 |
305+
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 |
306+
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
308307
309308
### Interactive mode
310309

SHA256SUMS

-4
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
44
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
55
25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin
6-
3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin
76
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
87
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
98
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
109
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
1110
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
1211
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
1312
75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin
14-
4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin
1513
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
1614
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
1715
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
@@ -21,7 +19,6 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
2119
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
2220
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
2321
aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin
24-
a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin
2522
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
2623
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
2724
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
@@ -35,6 +32,5 @@ d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/con
3532
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
3633
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
3734
1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin
38-
305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin
3935
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
4036
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model

examples/quantize/quantize.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
99
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
1010
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
1111
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
12-
{"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
1312
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
1413
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
1514
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},

ggml-cuda.cu

-37
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,6 @@ typedef struct {
2929
} block_q4_2;
3030
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
3131

32-
#define QK4_3 16
33-
typedef struct {
34-
__half d; // delta
35-
__half m; // min
36-
uint8_t qs[QK4_3 / 2]; // nibbles / quants
37-
} block_q4_3;
38-
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
39-
4032
#define QK5_0 32
4133
typedef struct {
4234
__half d; // delta
@@ -131,30 +123,6 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
131123
}
132124
}
133125

134-
static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
135-
const block_q4_3 * x = (const block_q4_3 *) vx;
136-
137-
const int i = blockIdx.x;
138-
139-
const float d = x[i].d;
140-
const float m = x[i].m;
141-
142-
const uint8_t * pp = x[i].qs;
143-
144-
for (int l = 0; l < QK4_3; l += 2) {
145-
const uint8_t vi = pp[l/2];
146-
147-
const int8_t vi0 = vi & 0xf;
148-
const int8_t vi1 = vi >> 4;
149-
150-
const float v0 = vi0*d + m;
151-
const float v1 = vi1*d + m;
152-
153-
y[i*QK4_3 + l + 0] = v0;
154-
y[i*QK4_3 + l + 1] = v1;
155-
}
156-
}
157-
158126
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
159127
const block_q5_0 * x = (const block_q5_0 *) vx;
160128

@@ -244,11 +212,6 @@ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t st
244212
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
245213
}
246214

247-
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
248-
const int nb = k / QK4_3;
249-
dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
250-
}
251-
252215
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
253216
const int nb = k / QK5_0;
254217
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);

ggml-cuda.h

-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ void ggml_cuda_pool_free(void * ptr, size_t size);
3434
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3535
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3636
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
37-
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3837
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3938
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
4039
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

ggml-opencl-dequant.cl

-21
Original file line numberDiff line numberDiff line change
@@ -60,25 +60,4 @@ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global f
6060
result[index + 1] = ((vi >> 4) - 8)*d;
6161
}
6262

63-
struct block_q4_3
64-
{
65-
ushort d;
66-
ushort m;
67-
uchar qs[8];
68-
};
69-
70-
__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) {
71-
const uint i = get_global_id(0) / 16;
72-
const uint l = get_local_id(0);
73-
74-
const float d = vload_half(0, (__global half*) &(blocks[i].d));
75-
const float m = vload_half(0, (__global half*) &(blocks[i].m));
76-
77-
const uchar vi = blocks[i].qs[l];
78-
79-
const uint index = i*16 + l*2;
80-
result[index + 0] = (vi & 0xf) * d + m;
81-
result[index + 1] = (vi >> 4) * d + m;
82-
}
83-
8463
);

ggml-opencl.c

+1-9
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ static cl_device_id device;
2424
static cl_context context;
2525
static cl_command_queue queue;
2626
static cl_program program;
27-
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
27+
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2;
2828
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
2929
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
3030

@@ -97,8 +97,6 @@ void ggml_cl_init(void) {
9797
CL_CHECK(err, "clCreateKernel");
9898
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
9999
CL_CHECK(err, "clCreateKernel");
100-
kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
101-
CL_CHECK(err, "clCreateKernel");
102100
}
103101

104102
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -150,12 +148,6 @@ void ggml_cl_sgemm_wrapper(
150148
local = 8;
151149
size_qb = global * (sizeof(short) + local) / 16;
152150
break;
153-
case GGML_TYPE_Q4_3:
154-
dequant = true;
155-
kernel = kernel_q4_3;
156-
local = 8;
157-
size_qb = global * (sizeof(short) * 2 + local) / 16;
158-
break;
159151
default:
160152
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
161153
abort();

0 commit comments

Comments
 (0)