@@ -15866,17 +15866,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
 
-    // difquant_init_tensors has a broad 12.5 % bump to the upper quant.
-    auto difquant_init_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8;
+    // difquant_first_last_tensors has a broad 13.75-16.66 % bump to the upper quant.
+    auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer < n_layers/8 || i_layer >= n_layers-2;
     };
-    // difquant_init_end_tensors has a broad 25 % bump to the upper quant.
-    auto difquant_init_end_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8;
+    // difquant_more_fl_tensors has a broad 26-29 % bump to the upper quant.
+    auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
     };
     // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant.
     auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
     // original formula use_more_bits:
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
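The percentage comments above are averages over typical layer counts, so the renamed predicates are easiest to sanity-check numerically. Below is a minimal standalone sketch (not part of the commit): the three predicates are copied from the new side of the hunk, use_more_bits is taken from the preserved comment, and the 48-layer count is an arbitrary assumption chosen as a common mid-size model depth.

// Standalone sanity check for the difquant bump fractions (assumed n_layers = 48).
#include <cstdio>

static bool difquant_first_last_tensors(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= n_layers-2;
}
static bool difquant_more_fl_tensors(int i_layer, int n_layers) {
    return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
}
static bool difquant_three_eights_tensors(int i_layer, int n_layers) {
    return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
}
// Original formula kept in the comment above, for comparison.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 48; // assumed; the ranges in the comments come from varying this
    int counts[4] = {0, 0, 0, 0};
    for (int i = 0; i < n_layers; ++i) {
        counts[0] += difquant_first_last_tensors(i, n_layers);
        counts[1] += difquant_more_fl_tensors(i, n_layers);
        counts[2] += difquant_three_eights_tensors(i, n_layers);
        counts[3] += use_more_bits(i, n_layers);
    }
    const char * names[4] = { "first_last", "more_fl", "three_eights", "use_more_bits" };
    for (int k = 0; k < 4; ++k) {
        printf("%-14s %2d/%d layers (%5.2f%%)\n", names[k], counts[k], n_layers, 100.0*counts[k]/n_layers);
    }
    return 0;
}

For 48 layers this prints 16.67%, 27.08%, 37.50% and 50.00% respectively, consistent with the ranges claimed in the comments and noticeably leaner than the original use_more_bits selection.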
@@ -16028,8 +16028,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-            new_type = difquant_init_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-        else new_type = difquant_init_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
+            new_type = difquant_first_last_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        else new_type = difquant_first_last_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
              ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
@@ -16102,8 +16102,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
-            new_type = difquant_init_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        else new_type = difquant_init_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = difquant_first_last_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else new_type = difquant_first_last_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16115,8 +16115,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-            new_type = difquant_init_end_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-        else new_type = difquant_init_end_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            new_type = difquant_more_fl_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        else new_type = difquant_more_fl_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16199,7 +16199,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
         new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS;
     }
@@ -16213,22 +16213,22 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-        new_type = difquant_init_end_tensors (i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+        new_type = difquant_more_fl_tensors (i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
         new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-        new_type = difquant_init_tensors (i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        new_type = difquant_first_last_tensors (i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
         new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-        new_type = difquant_init_end_tensors (i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        new_type = difquant_more_fl_tensors (i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-        new_type = difquant_init_end_tensors (i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        new_type = difquant_more_fl_tensors (i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
         new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -16331,18 +16331,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
     int i_layer = info.first, n_layer = info.second;
     if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
@@ -16351,18 +16351,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
     int i_layer = info.first, n_layer = info.second;
     if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors (i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
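Every branch in these hunks follows the same gating pattern: a difquant predicate decides, per layer, whether the tensor gets the upper or the lower of two quant types. Below is a minimal sketch of that pattern (not part of the commit), using stand-in enum values rather than the real GGML type constants and an assumed 48-layer model, and mirroring the ffn_up rule for LLAMA_FTYPE_MOSTLY_IQ3_M above.

#include <cstdio>

// Stand-ins for GGML_TYPE_IQ3_S / GGML_TYPE_IQ4_XS; the real code uses ggml_type.
enum fake_type { FAKE_IQ3_S, FAKE_IQ4_XS };

// Copied from the lambda in the first hunk.
static bool difquant_more_fl_tensors(int i_layer, int n_layers) {
    return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
}

int main() {
    const int n_layer = 48; // assumed layer count
    for (int i_layer = 0; i_layer < n_layer; ++i_layer) {
        // Same shape as the IQ3_M ffn_up rule: upper quant on selected layers, lower elsewhere.
        fake_type new_type = difquant_more_fl_tensors(i_layer, n_layer) ? FAKE_IQ4_XS : FAKE_IQ3_S;
        printf("ffn_up layer %2d -> %s\n", i_layer, new_type == FAKE_IQ4_XS ? "IQ4_XS" : "IQ3_S");
    }
    return 0;
}

With these inputs, layers 0-6 and 42-47 come out at IQ4_XS and the middle layers at IQ3_S, i.e. the first and last eighths of the stack carry the extra bits.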