Commit 179ad0f

Little rework of the difquant formulas

1 parent 644aa9f

1 file changed: src/llama.cpp (+26 -26)

@@ -15866,17 +15866,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
 
-    // difquant_init_tensors has a broad 12.5% bump to the upper quant.
-    auto difquant_init_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8;
+    // difquant_first_last_tensors has a broad 13.75-16.66% bump to the upper quant.
+    auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer < n_layers/8 || i_layer >= n_layers-2;
     };
-    // difquant_init_end_tensors has a broad 25% bump to the upper quant.
-    auto difquant_init_end_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8;
+    // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant.
+    auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
     };
     // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant.
     auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
     // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
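The percentage ranges quoted in the comments vary with model depth because n_layers/8 truncates. As a quick sanity check, here is a minimal standalone sketch (not part of the commit; n_layers = 80, the depth of an 80-layer 70B-class model, is an assumed example) that counts how many layers each new predicate selects:

    // Standalone sketch: share of layers each difquant predicate bumps
    // to the upper quant, for an assumed n_layers = 80.
    #include <cstdio>

    int main() {
        const int n_layers = 80;  // assumption; other depths give slightly different shares
        int first_last = 0, more_fl = 0, three_eights = 0;
        for (int i = 0; i < n_layers; ++i) {
            if (i < n_layers/8 || i >= n_layers - 2) ++first_last;
            if (i <= n_layers/8 || i >= 7*n_layers/8) ++more_fl;
            if (i <= n_layers/8 || i >= 7*n_layers/8 ||
                (i > 2*n_layers/8 && i < 3*n_layers/8)) ++three_eights;
        }
        // Prints 12/80 (15.00%), 21/80 (26.25%), 30/80 (37.50%) -- consistent
        // with the 13.75-16.66%, 26-29%, and 37.5% figures in the comments.
        printf("first_last:   %d/%d (%.2f%%)\n", first_last,   n_layers, 100.0*first_last/n_layers);
        printf("more_fl:      %d/%d (%.2f%%)\n", more_fl,      n_layers, 100.0*more_fl/n_layers);
        printf("three_eights: %d/%d (%.2f%%)\n", three_eights, n_layers, 100.0*three_eights/n_layers);
    }
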
@@ -16028,8 +16028,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
+                new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
@@ -16102,8 +16102,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+                new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16115,8 +16115,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+                new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16199,7 +16199,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS;
         }
@@ -16213,22 +16213,22 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = difquant_init_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
             new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -16331,18 +16331,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
@@ -16351,18 +16351,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
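
Every hunk above follows the same two-level pattern: the ftype selects a difquant predicate plus a pair of quant types, and the predicate decides per layer whether the tensor gets the upper or the base type. A minimal illustrative reduction of that shape (the bump_if helper is hypothetical and not in the commit; ggml_type and the GGML_TYPE_*/difquant_* names come from the diff):

    // Hypothetical helper: every predicate-gated type choice in this
    // diff reduces to this one-line pick.
    template <typename Pred>
    static ggml_type bump_if(Pred sensitive, int i_layer, int n_layers,
                             ggml_type upper, ggml_type base) {
        return sensitive(i_layer, n_layers) ? upper : base;
    }

    // e.g. the IQ3_M ffn_up case above is equivalent to:
    //   new_type = bump_if(difquant_more_fl_tensors, i_layer, n_layer,
    //                      GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_S);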
