Commit ddb1373

IQ3_XXL and IQ3_XXXL

We now have a full range of quants between IQ3_M and IQ4_XS.

1 parent a79633b commit ddb1373

4 files changed: +58 −17 lines changed

examples/quantize/quantize.cpp

Lines changed: 3 additions & 1 deletion

@@ -35,7 +35,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization",           },
     { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization",           },
     { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.70 bpw quantization mix",       },
-    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.95 bpw quantization mix",       },
+    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.80 bpw quantization mix",       },
+    { "IQ3_XXL",  LLAMA_FTYPE_MOSTLY_IQ3_XXL,  " 3.95 bpw quantization mix",       },
+    { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix",       },
     { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M"                  },
     { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization",            },
     { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B", },

gguf-py/gguf/constants.py

Lines changed: 8 additions & 0 deletions

@@ -1249,6 +1249,14 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4   = 33  # except 1d tensors
     MOSTLY_Q4_0_4_8   = 34  # except 1d tensors
     MOSTLY_Q4_0_8_8   = 35  # except 1d tensors
+    MOSTLY_IQ2_XL     = 38  # except 1d tensors
+    MOSTLY_IQ3_XL     = 39  # except 1d tensors
+    MOSTLY_Q2_K_L     = 40  # except 1d tensors
+    MOSTLY_IQ1_XS     = 41  # except 1d tensors
+    MOSTLY_IQ1_XL     = 42  # except 1d tensors
+    MOSTLY_IQ4_XSR    = 43  # except 1d tensors
+    MOSTLY_IQ3_XXL    = 44  # except 1d tensors
+    MOSTLY_IQ3_XXXL   = 45  # except 1d tensors
 
     GUESSED = 1024  # not specified in the model file

include/llama.h

Lines changed: 10 additions & 6 deletions

@@ -166,12 +166,16 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XL   = 36, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XL   = 37, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K_L   = 38, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_XS   = 39, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_XL   = 40, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 41, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XL   = 38, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XL   = 39, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_L   = 40, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XS   = 41, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XL   = 42, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 43, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXL  = 44, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 45, // except 1d tensors
+        LLAMA_FTYPE_CQS             = 99, // except 1d tensors
+
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

src/llama.cpp

Lines changed: 37 additions & 10 deletions

@@ -4524,7 +4524,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.80 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
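These strings are what the new mixes report once a model is quantized. A minimal sketch of reading the label back through the C API, assuming a standard llama.cpp build of this vintage (the model path is a placeholder):

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model-IQ3_XXL.gguf", mparams);
    if (model != NULL) {
        char desc[128];
        // The description includes the ftype name, e.g. "IQ3_S mix - 3.95 bpw".
        llama_model_desc(model, desc, sizeof(desc));
        printf("%s\n", desc);
        llama_free_model(model);
    }

    llama_backend_free();
    return 0;
}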
@@ -15931,14 +15933,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
                 else new_type = GGML_TYPE_IQ3_S;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                new_type = GGML_TYPE_IQ4_XS;
+            }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15979,7 +15982,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_IQ4_XS;
         }
@@ -16045,6 +16049,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
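The XL/XXL/XXXL mixes differ mainly in how many layers get bumped to the larger type: IQ3_XL gates on use_some_bits, IQ3_XXL on use_more_bits, and IQ3_XXXL on use_many_bits. For context, use_more_bits below is the upstream llama.cpp helper (first and last eighth of the layers, plus every third layer in between); use_some_bits and use_many_bits exist in this tree but their thresholds are not shown in the diff, so the bodies here are only hypothetical sketches of narrower and wider variants:

// Upstream llama.cpp helper.
static bool use_more_bits(int i_layer, int n_layer) {
    return i_layer < n_layer/8 || i_layer >= 7*n_layer/8 || (i_layer - n_layer/8)%3 == 2;
}

// Hypothetical sketches only -- the real thresholds in this tree may differ.
static bool use_some_bits(int i_layer, int n_layer) {   // fewer layers than use_more_bits
    return i_layer < n_layer/8 || i_layer >= 7*n_layer/8;
}
static bool use_many_bits(int i_layer, int n_layer) {   // more layers than use_more_bits
    return i_layer < n_layer/4 || i_layer >= 3*n_layer/4 || (i_layer - n_layer/8)%2 == 0;
}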
@@ -16114,6 +16128,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            new_type = use_some_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -16153,6 +16173,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.model.hparams.n_expert >= 4) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
@@ -16173,7 +16194,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
@@ -16202,9 +16224,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         ++qs.i_attention_wv;
@@ -16223,7 +16245,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -16239,8 +16263,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_up;
     }

@@ -16391,6 +16416,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:   default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
