@@ -15847,9 +15847,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
 
+    auto use_few_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8;
+    };
+    // few_bits has a broad 25% bump to the upper quant.
+    auto use_some_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+    };
+    // Formerly: return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+    // The intervals of 3 are replaced by a broad bump in the central layers. some_bits has a broad 37.5% bump to the upper quant.
     auto use_more_bits = [](int i_layer, int n_layers) -> bool {
-        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+        return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+    };
+    // more_bits has a broad 50% bump to the upper quant.
+    auto use_many_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 4*n_layers/8);
     };
+    // many_bits has a broad 75% bump to the upper quant.
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
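For intuition, here is a minimal standalone sketch (not part of the patch; the 32-layer count is hypothetical) that prints which layers each predicate selects and confirms the 25% / 37.5% / 50% / 75% coverage stated in the comments:

#include <cstdio>
#include <functional>

int main() {
    const int n_layers = 32; // hypothetical layer count, e.g. a 7B-class model
    // same predicates as the lambdas added to llama_tensor_get_type above
    auto print_mask = [&](const char * name, std::function<bool(int,int)> pred) {
        int hits = 0;
        printf("%-10s ", name);
        for (int i = 0; i < n_layers; ++i) {
            const bool on = pred(i, n_layers);
            hits += on;
            putchar(on ? '#' : '.');
        }
        printf("  %2d/%d layers bumped\n", hits, n_layers);
    };
    print_mask("few_bits",  [](int i, int n) { return i <= n/8 || i > 7*n/8; });
    print_mask("some_bits", [](int i, int n) { return i <= n/8 || i > 7*n/8 || (i >= 2*n/8 && i < 3*n/8); });
    print_mask("more_bits", [](int i, int n) { return i <= n/8 || i > 6*n/8 || (i >= 2*n/8 && i < 3*n/8); });
    print_mask("many_bits", [](int i, int n) { return i <= n/8 || i > 5*n/8 || (i >= 2*n/8 && i < 4*n/8); });
    return 0;
}

For n_layers = 32 this yields 8, 12, 16 and 24 bumped layers respectively: the first and last eighths always get the higher quant, and the broader tiers extend the bump into the early-middle and late layers.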
@@ -15917,10 +15931,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S;
-            else new_type = GGML_TYPE_IQ4_XS;
-        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
@@ -15969,7 +15980,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
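The n_gqa() >= 2 gate above routes grouped-query-attention models, whose shared K/V heads are more sensitive to quantization error, to the higher quant. A hedged sketch of what the check evaluates, assuming n_gqa() is n_head / n_head_kv as defined in llama.cpp's hparams; the concrete head counts are illustrative, not taken from the patch:

// illustrative GQA check (values assumed, e.g. a Llama-3-8B-like config)
struct hparams_sketch {
    int n_head    = 32;
    int n_head_kv = 8;
    int n_gqa() const { return n_head / n_head_kv; } // 4 here
};
// with n_gqa() == 4 >= 2, IQ3_M/IQ3_XL promote attn_v to GGML_TYPE_Q5_K;
// a classic MHA model (n_head_kv == n_head, so n_gqa() == 1) falls through
// to GGML_TYPE_IQ4_XS instead.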
@@ -15988,7 +16000,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
-                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+                           use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             }
         }
         ++qs.i_attention_wv;
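Note that after this change every arm of the IQ4_XSR attn_v ternary yields GGML_TYPE_Q5_K, so the branch is equivalent to the single assignment below; the ternary shape presumably survives only to keep the per-layer hooks in place for later retuning:

// equivalent simplification of the IQ4_XSR attn_v branch above
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
    new_type = GGML_TYPE_Q5_K;
}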
@@ -16027,9 +16039,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
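The attn_k selection for IQ3_M and IQ3_XL now has four outcomes, derived directly from the branches above (the bump predicate is use_some_bits at 37.5% for IQ3_M and use_many_bits at 75% for IQ3_XL):

// attn_k result matrix for the two ftypes above
//   ftype    GQA/MoE model   bumped layer   other layer
//   IQ3_M    yes             Q5_K           IQ4_XS
//   IQ3_M    no              IQ4_XS         IQ3_S
//   IQ3_XL   yes             Q5_K           IQ4_XS
//   IQ3_XL   no              IQ4_XS         IQ3_S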
@@ -16059,8 +16077,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
-                           use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS ;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S :
+                           use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S;
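As with attn_v under IQ4_XSR, all arms of the attn_q ternary now produce the same type (GGML_TYPE_IQ3_S, down from IQ4_XS), so the branch reduces to:

// equivalent simplification of the IQ4_XSR attn_q branch above
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
    new_type = GGML_TYPE_IQ3_S;
}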
@@ -16091,11 +16109,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                      : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (use_some_bits(i_layer, n_layer) ||
                     (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -16193,30 +16213,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
             int i_layer = info.first, n_layer = info.second;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
             ++qs.i_ffn_gate;
         }
         else if (name.find("ffn_up") != std::string::npos) {
             auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
             int i_layer = info.first, n_layer = info.second;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
             ++qs.i_ffn_up;
         }
 
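The ffn_gate and ffn_up tiers are now identical, with each low-bit ftype paired to one of the four bump predicates. A hedged worked example (32 layers assumed, ftype = LLAMA_FTYPE_MOSTLY_IQ2_M):

// use_some_bits(i, 32) is true for i in {0..4, 8..11, 29..31} (12 of 32 layers),
// so those ffn_gate / ffn_up tensors are bumped to GGML_TYPE_IQ3_XXS while
// the remaining 20 layers keep the base IQ2_M mapping.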