
Commit 86a7e4a

IQ3_UXL
1 parent 97fbd74 commit 86a7e4a

File tree: 4 files changed, +71 -22 lines

examples/quantize/quantize.cpp

Lines changed: 6 additions & 5 deletions

@@ -36,11 +36,12 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q2_K_L",   LLAMA_FTYPE_MOSTLY_Q2_K_L,   " 3.20G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization", },
     { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization", },
-    { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.70 bpw quantization mix", },
-    { "IQ3_ML",   LLAMA_FTYPE_MOSTLY_IQ3_ML,   " 3.80 bpw quantization mix", },
-    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.90 bpw quantization mix", },
-    { "IQ3_XXL",  LLAMA_FTYPE_MOSTLY_IQ3_XXL,  " 4.00 bpw quantization mix", },
-    { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix", },
+    { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.60 bpw quantization mix", },
+    { "IQ3_ML",   LLAMA_FTYPE_MOSTLY_IQ3_ML,   " 3.75 bpw quantization mix", },
+    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.85 bpw quantization mix", },
+    { "IQ3_XXL",  LLAMA_FTYPE_MOSTLY_IQ3_XXL,  " 3.95 bpw quantization mix", },
+    { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.05 bpw quantization mix", },
+    { "IQ3_UXL",  LLAMA_FTYPE_MOSTLY_IQ3_UXL,  " 4.15 bpw quantization mix", },
     { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M" },
     { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization", },
     { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B", },

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions

@@ -1405,6 +1405,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q3_K_XL  = 45  # except 1d tensors
     MOSTLY_IQ3_ML   = 46  # except 1d tensors
     MOSTLY_IQ3_XXXL = 47  # except 1d tensors
+    MOSTLY_IQ3_UXL  = 48  # except 1d tensors

     GUESSED = 1024  # not specified in the model file
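Keeping gguf-py's LlamaFileType in sync (MOSTLY_IQ3_UXL = 48, matching LLAMA_FTYPE_MOSTLY_IQ3_UXL below) means Python tooling that reads GGUF metadata reports the new file type by name rather than as an unknown integer.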

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -184,6 +184,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q3_K_XL  = 45, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_ML   = 46, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 47, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_UXL  = 48, // except 1d tensors
         LLAMA_FTYPE_CQS             = 99, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
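For context, a minimal sketch of requesting the new ftype through the public C API, assuming the usual llama.cpp entry points declared in this header (llama_model_quantize_default_params and llama_model_quantize); the file paths are placeholders:

    #include "llama.h"
    #include <cstdio>

    int main() {
        // Start from the library defaults, then ask for the new IQ3_S-based mix.
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_IQ3_UXL; // value 48, added above

        // llama_model_quantize returns 0 on success.
        if (llama_model_quantize("model-f16.gguf", "model-iq3_uxl.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }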

src/llama.cpp

Lines changed: 63 additions & 17 deletions

@@ -5239,11 +5239,12 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:    return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:    return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:     return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:     return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_ML:    return "IQ3_S mix - 3.80 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:    return "IQ3_S mix - 3.90 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:   return "IQ3_S mix - 4.00 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL:  return "IQ3_S mix - 4.10 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:     return "IQ3_S mix - 3.60 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_ML:    return "IQ3_S mix - 3.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:    return "IQ3_S mix - 3.85 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:   return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL:  return "IQ3_S mix - 4.05 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_UXL:   return "IQ3_S mix - 4.15 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:   return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4:  return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8:  return "Q4_0_4_8";

@@ -17648,7 +17649,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) new_type = GGML_TYPE_IQ4_XS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
         if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;

@@ -17673,7 +17674,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_XXS;
            else new_type = GGML_TYPE_IQ3_S;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S;
            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
        }

@@ -17698,7 +17700,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M ||
            ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
-           ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+           ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            new_type = GGML_TYPE_Q8_0;
        }
        else if (new_type != GGML_TYPE_Q8_0) new_type = GGML_TYPE_Q6_K;

@@ -17856,6 +17858,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
            else new_type = GGML_TYPE_Q5_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = difquant_seven_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+           else new_type = GGML_TYPE_Q5_K;
+       }
        // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
        //     if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
        //         new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;

@@ -17870,7 +17877,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        //     if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
        //     else new_type = GGML_TYPE_Q4_K;
        // }
-       // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+       // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+       //          ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
        //     if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
        //     else new_type = GGML_TYPE_Q4_K;
        // }

@@ -17903,7 +17911,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M ||
            ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
-           ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+           ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {

@@ -18097,6 +18105,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
            else new_type = GGML_TYPE_IQ4_XS;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = difquant_seven_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+           else new_type = GGML_TYPE_IQ4_XS;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :

@@ -18254,9 +18267,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        //         new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        //     else new_type = GGML_TYPE_IQ3_S;
        // }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+       // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+       //     if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+       //         new_type = difquant_seven_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+       //     else new_type = GGML_TYPE_IQ3_S;
+       // }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
            // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;

@@ -18406,6 +18424,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = GGML_TYPE_IQ4_XS;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+           else new_type = GGML_TYPE_IQ4_XS;
+       }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }

@@ -18518,7 +18541,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;

@@ -18582,10 +18605,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = GGML_TYPE_IQ4_XS;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = difquant_seven_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+           else new_type = GGML_TYPE_IQ4_XS;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS)
            new_type = GGML_TYPE_IQ3_S;
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ3_S;
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
+           new_type = GGML_TYPE_IQ3_S;
    } else {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)

@@ -18756,10 +18785,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q5_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = difquant_seven_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+           else new_type = GGML_TYPE_Q5_K;
+       }
        // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
        // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML)
        //     new_type = GGML_TYPE_IQ4_XS;
-       // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL)
+       // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+       //          ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
        //     new_type = GGML_TYPE_Q4_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {

@@ -18884,6 +18919,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+           else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;

@@ -19004,6 +19044,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_seven_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+           else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;

@@ -19168,8 +19213,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_ML:   default_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   default_type = GGML_TYPE_IQ3_S; break;
-       case LLAMA_FTYPE_MOSTLY_IQ3_XXL: default_type = GGML_TYPE_IQ3_S; break;
+       case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  default_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
+       case LLAMA_FTYPE_MOSTLY_IQ3_UXL:  default_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
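All of the new IQ3_UXL branches in llama_tensor_get_type follow one pattern: on GQA or MoE models (n_gqa() >= 2 || n_expert >= 2), a difquant_* predicate decides which share of a tensor family is upgraded to the higher-bit type in the ternary. The difquant_seven_eights_tensors helper itself is not part of this diff; a sketch of its assumed behavior, shown only to illustrate the selection pattern:

    // Illustrative assumption: the real helper is defined elsewhere in this tree.
    // Presumed to return true for roughly seven eighths of a tensor family, so the
    // call sites above upgrade those tensors (e.g. Q5_K -> Q6_K) and leave the
    // remaining eighth at the lower type.
    static bool difquant_seven_eights_tensors(int i_tensor, int n_tensors) {
        return i_tensor < (7 * n_tensors) / 8;
    }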
