
Commit 4eecee5

iq1_m: small PPL improvement via super-block scale adjustment
After quantizing the block scales, redo the super-block scale fit.

PPL(LLaMA-v2-7B)  = 9.3346
PPL(LLaMA-v2-13B) = 6.8419
PPL(LLaMA-v2-70B) = 4.8294
PPL(Mistral-7B)   = 8.1624
1 parent 0fe61f1 commit 4eecee5

1 file changed: ggml-quants.c (+22, -2)

@@ -11992,6 +11992,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     float sumqx[4], sumq2[4];
 
     iq1m_scale_t s;
+    const float * xx;
 
     for (int ibl = 0; ibl < nbl; ++ibl) {
 
@@ -12126,7 +12127,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
                 scale = -scale;
                 best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
             }
-            const float * xx;
             bool all_on_grid = true;
             for (int k = 0; k < block_size/8; ++k) {
                 if (k == 0) xx = best_k < 2 ? x_p : x_m;
@@ -12173,13 +12173,33 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
         uint16_t * sc = (uint16_t *)y[ibl].scales;
         float d = max_scale/15;
         float id = 1/d;
+        float sumqx_f = 0, sumq2_f = 0;
         for (int ib = 0; ib < QK_K/block_size; ++ib) {
             int l = nearest_int(0.5f*(id*scales[ib+0]-1));
             l = MAX(0, MIN(7, l));
             sc[ib/4] |= (l << 3*(ib%4));
             y[ibl].qh[ib] |= masks[shifts[ib]];
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+                else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+                for (int j = 0; j < 8; ++j) {
+                    float w = weight[8*k + j];
+                    float q = xx[(pg[j] - 1)/2]*(2*l+1);
+                    sumqx_f += w*q*xb[8*k+j];
+                    sumq2_f += w*q*q;
+                }
+            }
         }
-        s.fp16 = GGML_FP32_TO_FP16(d*1.125f);   // 1.125f is another fudge factor. Don't ask me why it is needed.
+        if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+        s.fp16 = GGML_FP32_TO_FP16(d*1.1125f);  // 1.1125f is another fudge factor. Don't ask me why it is needed.
         sc[0] |= ((s.u16 & 0x000f) << 12);
         sc[1] |= ((s.u16 & 0x00f0) <<  8);
         sc[2] |= ((s.u16 & 0x0f00) <<  4);
