Skip to content

Commit f9968a5

Browse files
committed
ggml : remove Q4_2 mode
1 parent 4ab39d2 commit f9968a5

File tree

7 files changed

+3
-361
lines changed

7 files changed

+3
-361
lines changed

examples/quantize/quantize.cpp

-1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
1010
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
1111
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
12-
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
1312
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
1413
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
1514
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},

ggml-cuda.cu

-37
Original file line number | Diff line number | Diff line change
@@ -49,13 +49,6 @@ typedef struct {
4949
} block_q4_1;
5050
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
5151

52-
#define QK4_2 16
53-
typedef struct {
54-
half d; // delta
55-
uint8_t qs[QK4_2 / 2]; // nibbles / quants
56-
} block_q4_2;
57-
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
58-
5952
#define QK5_0 32
6053
typedef struct {
6154
half d; // delta
@@ -117,29 +110,6 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
117110
}
118111
}
119112

120-
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
121-
const block_q4_2 * x = (const block_q4_2 *) vx;
122-
123-
const int i = blockIdx.x;
124-
125-
const float d = x[i].d;
126-
127-
const uint8_t * pp = x[i].qs;
128-
129-
for (int l = 0; l < QK4_2; l += 2) {
130-
const uint8_t vi = pp[l/2];
131-
132-
const int8_t vi0 = vi & 0xf;
133-
const int8_t vi1 = vi >> 4;
134-
135-
const float v0 = (vi0 - 8)*d;
136-
const float v1 = (vi1 - 8)*d;
137-
138-
y[i*QK4_2 + l + 0] = v0;
139-
y[i*QK4_2 + l + 1] = v1;
140-
}
141-
}
142-
143113
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
144114
static const int qk = QK5_0;
145115

@@ -215,11 +185,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
215185
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
216186
}
217187

218-
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
219-
const int nb = k / QK4_2;
220-
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
221-
}
222-
223188
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
224189
const int nb = k / QK5_0;
225190
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -254,8 +219,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
254219
return dequantize_row_q4_0_cuda;
255220
case GGML_TYPE_Q4_1:
256221
return dequantize_row_q4_1_cuda;
257-
case GGML_TYPE_Q4_2:
258-
return dequantize_row_q4_2_cuda;
259222
case GGML_TYPE_Q5_0:
260223
return dequantize_row_q5_0_cuda;
261224
case GGML_TYPE_Q5_1:

ggml-opencl.c

+1 -29
Original file line number | Diff line number | Diff line change
@@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f
5252
result[index + 1] = (vi >> 4) * d + m;
5353
}
5454

55-
struct block_q4_2
56-
{
57-
ushort d;
58-
uchar qs[8];
59-
};
60-
61-
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
62-
const uint i = get_global_id(0) / 16;
63-
const uint l = get_local_id(0);
64-
65-
const float d = vload_half(0, (__global half*) &blocks[i].d);
66-
67-
const uchar vi = blocks[i].qs[l];
68-
69-
const uint index = i*16 + l*2;
70-
result[index + 0] = ((vi & 0xf) - 8)*d;
71-
result[index + 1] = ((vi >> 4) - 8)*d;
72-
}
73-
74-
7555
struct block_q5_0
7656
{
7757
float d;
@@ -167,7 +147,7 @@ static cl_device_id device;
167147
static cl_context context;
168148
static cl_command_queue queue;
169149
static cl_program program;
170-
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
150+
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
171151
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
172152
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
173153

@@ -238,8 +218,6 @@ void ggml_cl_init(void) {
238218
CL_CHECK(err, "clCreateKernel");
239219
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
240220
CL_CHECK(err, "clCreateKernel");
241-
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
242-
CL_CHECK(err, "clCreateKernel");
243221
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244222
CL_CHECK(err, "clCreateKernel");
245223
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper(
292270
local = 16;
293271
size_qb = global * (sizeof(float) * 2 + local) / 32;
294272
break;
295-
case GGML_TYPE_Q4_2:
296-
dequant = true;
297-
kernel = kernel_q4_2;
298-
local = 8;
299-
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
300-
break;
301273
case GGML_TYPE_Q5_0:
302274
dequant = true;
303275
kernel = kernel_q5_0;

0 commit comments

Comments
 (0)