@@ -17801,18 +17801,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1780117801 new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
1780217802 }
1780317803 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
17804- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
17805- // new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
17804+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1780617805 else new_type = GGML_TYPE_IQ3_S;
1780717806 }
1780817807 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
17809- if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
17810- new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17808+ if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1781117809 else new_type = GGML_TYPE_Q4_K;
1781217810 }
1781317811 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
17814- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17815- new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17812+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1781617813 else new_type = GGML_TYPE_Q4_K;
1781717814 }
1781817815 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -17853,9 +17850,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1785317850 // if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1785417851 // else new_type = GGML_TYPE_Q4_K;
1785517852 // }
17856- else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
17857- (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
17858- new_type = GGML_TYPE_Q6_K;
17853+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
17854+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
1785917855 else new_type = GGML_TYPE_Q5_K;
1786017856 }
1786117857 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
@@ -18038,13 +18034,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1803818034 else new_type = GGML_TYPE_IQ3_XXS;
1803918035 }
1804018036 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18041- if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
18042- new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
18037+ if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18038+ // new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1804318039 else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1804418040 }
1804518041 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18046- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18047- new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
18042+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18043+ // new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1804818044 else new_type = GGML_TYPE_IQ3_S;
1804918045 }
1805018046 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
0 commit comments