@@ -4524,7 +4524,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.80 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 3.95 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
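Note: the "bpw" figures in these strings are effective bits per weight, so they map directly to an on-disk size estimate. A minimal sketch of that arithmetic, assuming a hypothetical 7B-parameter model (the parameter count is an illustration, not something from this commit):

    #include <cstdio>

    // Rough GGUF size estimate from an effective bits-per-weight figure.
    // Ignores metadata and the few tensors kept at other types, so it is
    // only a ballpark.
    static double est_size_gib(double n_params, double bpw) {
        return n_params * bpw / 8.0 / (1024.0 * 1024.0 * 1024.0);
    }

    int main() {
        const double n_params = 7.0e9; // assumed example model size
        std::printf("IQ3_XL   (3.80 bpw): %.2f GiB\n", est_size_gib(n_params, 3.80));
        std::printf("IQ3_XXL  (3.95 bpw): %.2f GiB\n", est_size_gib(n_params, 3.95));
        std::printf("IQ3_XXXL (4.10 bpw): %.2f GiB\n", est_size_gib(n_params, 4.10));
    }

At 7e9 weights the three mixes land around 3.10, 3.22 and 3.34 GiB, i.e. roughly 120 MiB per step, which is the spacing the new ftypes introduce between IQ3_M and IQ4_XS.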
@@ -15931,14 +15933,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
                 else new_type = GGML_TYPE_IQ3_S;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                new_type = GGML_TYPE_IQ4_XS;
+            }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -15979,7 +15982,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_IQ4_XS;
         }
@@ -16045,6 +16049,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
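Note: the hunk above is the core of the new graduation. IQ3_XL, IQ3_XXL and IQ3_XXXL take the same attn_k branch but gate the upgrade through use_some_bits, use_more_bits and use_many_bits respectively. Upstream llama.cpp defines use_more_bits as a layer-position test (first eighth, last eighth, plus every third layer in between); use_some_bits and use_many_bits are fork-specific helpers whose bodies are not in this diff, so the narrower/wider variants below are assumptions illustrating the intended pattern, not the fork's real code:

    // Upstream llama.cpp's predicate: upgrade the first and last eighth of
    // the layers, plus every third layer in between.
    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    }

    // Hypothetical sketches of the fork's helpers: the same positional test
    // applied to fewer ("some") or more ("many") layers.
    static bool use_some_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8;
    }
    static bool use_many_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%2 == 1;
    }

Each step widens the set of layers whose attn_k projection gets the higher-bit type, which is what moves the average from 3.80 to 3.95 to 4.10 bpw.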
@@ -16114,6 +16128,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            new_type = use_some_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
             new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -16153,6 +16173,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.model.hparams.n_expert >= 4) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
@@ -16173,7 +16194,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
                 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                     new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
@@ -16202,9 +16224,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         ++qs.i_attention_wv;
@@ -16223,7 +16245,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
@@ -16239,8 +16263,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_up;
     }

@@ -16391,6 +16416,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL: default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
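Note: for the new ftypes to be selectable from the command line, the fork presumably also registers them in the quantize tool's QUANT_OPTIONS table (examples/quantize/quantize.cpp upstream); that change is not part of this diff. A self-contained sketch of what such entries look like, using a placeholder enum and the description strings from the first hunk:

    #include <cstdio>
    #include <vector>

    // Mirrors the shape of upstream's QUANT_OPTIONS table; the enum values
    // here are placeholders, not the fork's real llama_ftype constants.
    enum sketch_ftype { SKETCH_IQ3_XL, SKETCH_IQ3_XXL, SKETCH_IQ3_XXXL };

    struct quant_option {
        const char * name;
        sketch_ftype ftype;
        const char * desc;
    };

    static const std::vector<quant_option> QUANT_OPTIONS = {
        { "IQ3_XL",   SKETCH_IQ3_XL,   "IQ3_S mix - 3.80 bpw" },
        { "IQ3_XXL",  SKETCH_IQ3_XXL,  "IQ3_S mix - 3.95 bpw" },
        { "IQ3_XXXL", SKETCH_IQ3_XXXL, "IQ3_S mix - 4.10 bpw" },
    };

    int main() {
        for (const auto & q : QUANT_OPTIONS) {
            std::printf("%-9s  %s\n", q.name, q.desc);
        }
    }

Assuming the standard tool name of this llama.cpp era, a model would then be quantized with something like: ./llama-quantize model-f16.gguf model-IQ3_XXL.gguf IQ3_XXL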