Commit 3c4dd01

ikawrakow (Kawrakow) authored and committed
ggml : importance matrix support for legacy quants (ggml-org#4969)
* imatrix: adding support for legacy quants
* imatrix: guard Q4_0/Q5_0 against ffn_down craziness

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 9f8f94e commit 3c4dd01

File tree: ggml-quants.c, ggml-quants.h, ggml.c, llama.cpp

4 files changed, +226 -8 lines changed

ggml-quants.c (+192)

@@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     quantize_row_q4_0_reference(x, y, k);
 }
 
+
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
     const int qk = QK4_1;
 
@@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
     return nrow * row_size;
 }
 
+static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_0_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_0];
+    int8_t L[QK4_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK4_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_0 * ib;
+        const float * qw = quant_weights + QK4_0 * ib;
+        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
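Note: this is where the importance matrix enters for Q4_0. The per-column importances qw[j] are folded into per-weight error weights, weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]), and the block scale is then fitted against those weights. make_qx_quants itself is not part of this commit; the standalone sketch below (the helper name weighted_q4_scale is invented for illustration) shows the kind of weighted scale search such a fit amounts to, assuming the objective is to minimize sum_j weight[j]*(x[j] - d*q[j])^2 over a few candidate scales.

    #include <math.h>
    #include <stdint.h>

    // Illustrative only: choose a Q4_0-style scale d that minimizes the weighted
    // squared error sum_j w[j]*(x[j] - d*q[j])^2, with q[j] clamped to [-8, 7].
    // The real make_qx_quants uses a more careful search and returns the quants
    // offset by 8 so that two of them fit in one byte (see the packing loop above).
    static float weighted_q4_scale(int n, const float * x, const float * w, int8_t * q) {
        float amax = 0, max = 0;
        for (int j = 0; j < n; ++j) {
            if (fabsf(x[j]) > amax) { amax = fabsf(x[j]); max = x[j]; }
        }
        if (amax == 0) { for (int j = 0; j < n; ++j) q[j] = 0; return 0.f; }
        float best_d = max/-8.f, best_err = INFINITY;
        for (int is = -4; is <= 4; ++is) {          // a few candidate scales around max/-8
            float d = max/(-8.f + 0.1f*is), id = 1/d, err = 0;
            for (int j = 0; j < n; ++j) {
                int l = (int)roundf(id*x[j]);
                l = l < -8 ? -8 : l > 7 ? 7 : l;
                float diff = x[j] - d*l;
                err += w[j]*diff*diff;
            }
            if (err < best_err) { best_err = err; best_d = d; }
        }
        float id = 1/best_d;
        for (int j = 0; j < n; ++j) {
            int l = (int)roundf(id*x[j]);
            q[j] = (int8_t)(l < -8 ? -8 : l > 7 ? 7 : l);
        }
        return best_d;
    }

Columns with large importance values thus pull the scale toward reproducing their entries accurately, which is the whole point of the imatrix.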
+size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_1 == 32, "QK4_1 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_1_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_1];
+    uint8_t L[QK4_1], Laux[QK4_1];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK4_1;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_1 * ib;
+        const float * qw = quant_weights + QK4_1 * ib;
+        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min;
+        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
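Note: Q4_1 blocks carry both a scale and an offset, so the weighted fit here is affine rather than a pure scale. make_qkx3_quants is not shown in this commit; as a rough illustration only (the helper weighted_affine_fit and its signature are invented), the closed-form weighted least-squares solution for d and m with one fixed assignment of 4-bit levels L looks as follows; the real routine additionally searches over candidate level assignments.

    #include <stdint.h>

    // Illustrative only: weighted least-squares fit of x[j] ~ d*L[j] + m for a
    // fixed set of levels L[j] in 0..15. Returns 0 if the block is degenerate.
    static int weighted_affine_fit(int n, const float * x, const float * w,
                                   const uint8_t * L, float * d, float * m) {
        float sw = 0, swl = 0, swll = 0, swx = 0, swlx = 0;
        for (int j = 0; j < n; ++j) {
            float l = L[j];
            sw   += w[j];
            swl  += w[j]*l;
            swll += w[j]*l*l;
            swx  += w[j]*x[j];
            swlx += w[j]*l*x[j];
        }
        float det = sw*swll - swl*swl;
        if (det == 0) return 0;
        *d = (sw*swlx - swl*swx)/det;
        *m = (swll*swx - swl*swlx)/det;
        return 1;
    }

The returned min is negated before storage (y[ib].m = GGML_FP32_TO_FP16(-min) above), so the usual Q4_1 reconstruction x ≈ d*q + m applies unchanged at dequantization time.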
+size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK5_0 == 32, "QK5_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q5_0_reference(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK5_0];
+    int8_t L[QK5_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int nb = n_per_row/QK5_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_0 * ib;
+        const float * qw = quant_weights + QK5_0 * ib;
+        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0;
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+
+        memcpy(&y[ib].qh, &qh, sizeof(qh));
+    }
+}
+
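Note: Q5_0 (and Q5_1 below) split each 5-bit quant across two fields: the low nibble goes into qs[] with two quants per byte, and the fifth bit of each of the 32 quants goes into the corresponding bit of the 32-bit qh mask. The standalone round-trip sketch below (not part of the commit) illustrates that packing for quants q[j] in 0..31.

    #include <assert.h>
    #include <stdint.h>

    // Illustrative only: pack 32 five-bit values the way the loop above does,
    // then unpack them again and check the round trip. Assumes q[j] < 32.
    static void pack_unpack_qh_demo(const uint8_t q[32]) {
        uint8_t  qs[16];
        uint32_t qh = 0;
        for (int j = 0; j < 16; ++j) {
            qs[j] = (q[j] & 0x0F) | ((q[j+16] & 0x0F) << 4);
            qh |= ((uint32_t)((q[j]    & 0x10u) >> 4)) << (j + 0);
            qh |= ((uint32_t)((q[j+16] & 0x10u) >> 4)) << (j + 16);
        }
        for (int j = 0; j < 16; ++j) {
            uint8_t lo0 = qs[j] & 0x0F, lo1 = qs[j] >> 4;
            uint8_t hi0 = (qh >> (j + 0)) & 1, hi1 = (qh >> (j + 16)) & 1;
            assert(((hi0 << 4) | lo0) == q[j]);
            assert(((hi1 << 4) | lo1) == q[j+16]);
        }
    }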
+size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
3182+
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
3183+
static_assert(QK5_1 == 32, "QK5_1 must be 32");
3184+
3185+
if (!quant_weights) {
3186+
quantize_row_q5_1_reference(x, y, n_per_row);
3187+
return;
3188+
}
3189+
3190+
float weight[QK5_1];
3191+
uint8_t L[QK5_1], Laux[QK5_1];
3192+
3193+
float sum_x2 = 0;
3194+
for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
3195+
float sigma2 = sum_x2/n_per_row;
3196+
3197+
const int nb = n_per_row/QK5_1;
3198+
for (int ib = 0; ib < nb; ++ib) {
3199+
const float * xb = x + QK5_1 * ib;
3200+
const float * qw = quant_weights + QK5_1 * ib;
3201+
for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
3202+
float min;
3203+
float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
3204+
y[ib].d = GGML_FP32_TO_FP16(d);
3205+
y[ib].m = GGML_FP32_TO_FP16(-min);
3206+
3207+
uint32_t qh = 0;
3208+
for (int j = 0; j < 16; ++j) {
3209+
const uint8_t xi0 = L[j];
3210+
const uint8_t xi1 = L[j+16];
3211+
y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
3212+
// get the 5-th bit and store it in qh at the right position
3213+
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
3214+
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
3215+
}
3216+
memcpy(&y[ib].qh, &qh, sizeof(qh));
3217+
}
3218+
}
3219+
+size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    if (!quant_weights) {
+        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 static const uint64_t iq2xxs_grid[256] = {

ggml-quants.h

+4
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,7 @@ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row,
253253
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
254254
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
255255
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
256+
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
257+
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
258+
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
259+
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
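
Note: these row-wise entry points take an optional importance matrix. From the implementations above, the imatrix pointer is expected to supply n_per_row values that are reused for every row, and a NULL pointer falls back to the existing reference quantization. A hypothetical caller (the wrapper name quantize_tensor_q4_0 and the include lines are assumptions about the build setup, not part of the commit) might look like:

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"          // for ggml_row_size / GGML_TYPE_Q4_0
    #include "ggml-quants.h"   // include path depends on the build setup

    // Hypothetical helper: quantize an nrows x n_per_row float tensor to Q4_0,
    // using one row of importance weights shared by all rows, as above.
    static void * quantize_tensor_q4_0(const float * data, int nrows, int n_per_row,
                                       const float * imatrix, int64_t * hist) {
        const size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
        void * q = malloc(row_size * nrows);
        if (!q) return NULL;
        const size_t written = quantize_q4_0(data, q, nrows, n_per_row, hist, imatrix);
        (void) written;        // equals row_size * nrows on success
        return q;
    }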

ggml.c (+20 -8)

@@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
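
Note: the rewritten cases also change the chunking contract of ggml_quantize_chunk for these types: a chunk must now start on a row boundary (start % n_per_row == 0), and its output begins (start / n_per_row) * row_size bytes into dst rather than at a block index derived from QK4_0 and friends. Purely as an illustration of that arithmetic (the helper name is invented):

    #include <stddef.h>

    // Illustrative only: byte offset of a chunk's output, mirroring the new
    // GGML_ASSERT(start % n_per_row == 0) contract in ggml_quantize_chunk.
    static size_t quant_chunk_dst_offset(int start, int n_per_row, size_t row_size) {
        return ((size_t) start / (size_t) n_per_row) * row_size;
    }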

llama.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -8374,6 +8374,8 @@ struct quantize_state_internal {
83748374
int n_k_quantized = 0;
83758375
int n_fallback = 0;
83768376

8377+
bool has_imatrix = false;
8378+
83778379
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
83788380
: model(model)
83798381
, params(params)
@@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
85468548
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
85478549
new_type = GGML_TYPE_Q5_K;
85488550
}
8551+
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
8552+
&& qs.has_imatrix && i_layer < n_layer/8) {
8553+
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
8554+
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
8555+
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
8556+
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
8557+
}
85498558
++qs.i_feed_forward_w2;
85508559
} else if (name.find("attn_output.weight") != std::string::npos) {
85518560
if (arch != LLM_ARCH_FALCON) {
@@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
86698678
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
86708679
if (imatrix_data) {
86718680
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
8681+
qs.has_imatrix = true;
86728682
}
86738683
}
86748684

0 commit comments

Comments
 (0)