Commit ff48606

IQ1_XL, IQ2_S, IQ2_XS enhanced
1 parent 8a1ab24 commit ff48606

1 file changed: +105 −5 lines changed

src/llama.cpp

Lines changed: 105 additions & 5 deletions
@@ -17548,6 +17548,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 4*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
+    // difquant_all_tensors applies a broad 100% bump to the upper quant (e.g. 32/32 layers), kept as a separate predicate for easy mass edits during tests.
+    auto difquant_all_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers;
+    };
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
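To visualize what these predicates select, here is a minimal standalone sketch (the two lambdas are copied from the hunk above; the choice of 32 layers is an arbitrary example, not something fixed by the commit):

// Minimal sketch: print which of 32 layers each difquant predicate bumps
// to the upper quant type. Lambdas copied from the hunk above; n_layers = 32
// is illustrative only.
#include <cstdio>

int main() {
    auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
        return i_layer <= n_layers/8 || i_layer > 4*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
    };
    auto difquant_all_tensors = [](int i_layer, int n_layers) -> bool {
        return i_layer <= n_layers; // always true for valid layer indices
    };
    const int n_layers = 32;
    printf("six_eights: ");
    for (int i = 0; i < n_layers; ++i) putchar(difquant_six_eights_tensors(i, n_layers) ? '#' : '.');
    printf("\nall:        ");
    for (int i = 0; i < n_layers; ++i) putchar(difquant_all_tensors(i, n_layers) ? '#' : '.');
    putchar('\n');
    return 0;
}

For 32 layers, difquant_six_eights_tensors fires on 24 of them (layers 0-4, 8-11, and 17-31), i.e. six eighths, while difquant_all_tensors fires on every layer, matching the 32/32 note in the comment.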
@@ -17709,7 +17713,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
             else if (qs.model.hparams.n_expert >= 8)
@@ -17728,6 +17733,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_expert >= 8)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_expert >= 4)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_expert >= 2)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 4)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 2)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
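Read top to bottom, the new IQ1_XL / IQ2_S branch is a waterfall over expert count and GQA ratio: the first matching tier fixes both the type pair and the bump schedule. A hypothetical standalone condensation (the struct and function names below are illustrative, not llama.cpp API; the real code writes into new_type directly) makes the mapping explicit:

// Hypothetical condensation of the IQ1_XL / IQ2_S attn_v waterfall above.
#include <cstdio>

struct wv_pick { const char * bumped; const char * base; const char * schedule; };

// Mirrors the else-if cascade: the first matching (n_expert, n_gqa) tier wins.
static wv_pick pick_attn_v_iq1xl_iq2s(int n_expert, int n_gqa) {
    if ((n_expert >= 8 && n_gqa >= 2) || n_gqa >= 12) return {"Q8_0",   "Q6_K",    "six_eights"};
    if (n_expert >= 8)                                return {"Q6_K",   "Q5_K",    "all"};
    if ((n_expert >= 4 && n_gqa >= 2) || n_gqa >= 8)  return {"Q6_K",   "Q5_K",    "six_eights"};
    if (n_expert >= 4)                                return {"Q5_K",   "Q4_K",    "all"};
    if ((n_expert >= 2 && n_gqa >= 2) || n_gqa >= 7)  return {"Q5_K",   "Q4_K",    "six_eights"};
    if (n_expert >= 2)                                return {"Q4_K",   "IQ4_XS",  "all"};
    if (n_gqa >= 4)                                   return {"Q4_K",   "IQ4_XS",  "six_eights"};
    if (n_gqa >= 2)                                   return {"IQ4_XS", "IQ3_S",   "six_eights"};
    return {"IQ3_S", "IQ3_XXS", "all"};
}

int main() {
    // Example: dense model (n_expert = 1) with a GQA ratio of 4.
    wv_pick p = pick_attn_v_iq1xl_iq2s(/*n_expert=*/1, /*n_gqa=*/4);
    printf("%s on bumped layers, %s elsewhere (%s schedule)\n", p.bumped, p.base, p.schedule);
    return 0;
}

A dense model with a GQA ratio of 4 therefore gets Q4_K on six eighths of its attn_v layers and IQ4_XS on the rest; and since difquant_all_tensors is always true, every "all" tier is effectively a flat upgrade to the bumped type.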
@@ -17857,7 +17881,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
             else if (qs.model.hparams.n_expert >= 8)
@@ -17876,6 +17901,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_expert >= 8)
+                new_type = difquant_all_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_expert >= 4)
+                new_type = difquant_all_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 2)
+                new_type = difquant_all_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_gqa() >= 4)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_gqa() >= 2)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = difquant_all_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
@@ -18045,7 +18089,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 8)
@@ -18062,6 +18107,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
             else new_type = difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 8)
+                new_type = difquant_all_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_expert >= 4)
+                new_type = difquant_all_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_expert >= 2)
+                new_type = difquant_all_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_gqa() >= 2)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+            else new_type = difquant_all_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -18288,7 +18350,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else if (qs.model.hparams.n_expert >= 8)
@@ -18305,6 +18368,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_expert >= 8)
+                new_type = difquant_all_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 4)
+                new_type = difquant_all_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_expert >= 2)
+                new_type = difquant_all_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_gqa() >= 2)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = difquant_all_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
@@ -18409,7 +18489,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
             else if (qs.model.hparams.n_expert >= 8)
@@ -18428,6 +18509,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_expert >= 8)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_expert >= 4)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_expert >= 2)
+                new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 4)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 2)
+                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = difquant_all_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
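The same waterfall shape repeats for attn_k, attn_q, attn_output, and this second attn_v path, only with a different type ladder per tensor class. Collecting one tier from the hunks above, e.g. the dense-model n_gqa() >= 2 case, gives a hypothetical side-by-side summary (illustrative code, not part of the commit):

// Hypothetical summary of the IQ1_XL / IQ2_S six-eighths tier for a dense
// model with n_gqa() >= 2, collected from the hunks in this commit.
// bumped / base = type chosen when difquant_six_eights_tensors is true / false.
#include <cstdio>

struct tier_row { const char * tensor; const char * bumped; const char * base; };

static const tier_row iq1xl_iq2s_gqa2[] = {
    { "attn_v (first path)",  "IQ4_XS", "IQ3_S"   },
    { "attn_k",               "IQ2_S",  "IQ2_XS"  },
    { "attn_q",               "IQ2_XS", "IQ2_XXS" },
    { "attn_output",          "IQ2_S",  "IQ2_XS"  },
    { "attn_v (second path)", "IQ4_XS", "IQ3_S"   },
};

int main() {
    for (const tier_row & r : iq1xl_iq2s_gqa2)
        printf("%-22s %-7s over %s\n", r.tensor, r.bumped, r.base);
    return 0;
}

At this tier attn_v keeps markedly higher precision than the other projections and attn_q the lowest, which matches the usual llama.cpp heuristic of protecting the value projection, the most quantization-sensitive of the four, hardest.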
