@@ -17814,17 +17814,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+                new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
             else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -18246,7 +18246,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-                new_type = difquant_six_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S ;
+                new_type = difquant_five_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ3_XXS ;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -18594,24 +18594,31 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
                 new_type = GGML_TYPE_IQ2_XS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                    new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-                else new_type = GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                    new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-                else new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                    new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-                else new_type = GGML_TYPE_IQ3_S;
-            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS)
+                new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S)
+                new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
+                new_type = GGML_TYPE_IQ3_S;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q3_K;
+            // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            // if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            // new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            // else new_type = GGML_TYPE_IQ3_XXS;
+            // }
+            // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            // if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            // new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            // else new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            // }
+            // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            // if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            // new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            // else new_type = GGML_TYPE_IQ3_S;
+            // }
             // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             // if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
             // new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -18664,13 +18671,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // new_type = GGML_TYPE_IQ4_XS;
             // else new_type = GGML_TYPE_Q3_K;
             // }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS)
-                new_type = GGML_TYPE_IQ3_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
-                new_type = GGML_TYPE_IQ3_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q3_K;
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)
@@ -18680,8 +18680,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) new_type = GGML_TYPE_IQ3_S;
@@ -18948,7 +18948,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-                new_type = (difquant_six_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+                new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -19083,7 +19083,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
-                new_type = (difquant_six_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+                new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
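
Note on the `difquant_*_tensors` predicates this diff keeps switching between: their definitions are not part of the hunks above. From the call sites they appear to take a tensor index and a tensor count and return a bool that selects the higher of the two candidate quant types for some subset of layers. Below is a minimal, hypothetical sketch under that assumption; the `_sketch` names and the exact selection rules are illustrative guesses, not the fork's actual implementations.

```cpp
#include <cassert>

// Hypothetical sketch only: the real difquant_* helpers are defined elsewhere
// in this fork and may use different selection rules (for example spreading
// the bumped layers across the model instead of taking a leading block).

// Assumed behaviour: true only for the first and last tensor of the run,
// so just those two get the higher quant type in the ternaries above.
static bool difquant_first_last_tensors_sketch(int i_tensor, int n_tensors) {
    assert(n_tensors > 0);
    return i_tensor == 0 || i_tensor == n_tensors - 1;
}

// Assumed behaviour: true for roughly the first five eighths of the tensors,
// mirroring the five_eights / six_eights naming used in the diff.
static bool difquant_five_eights_tensors_sketch(int i_tensor, int n_tensors) {
    assert(n_tensors > 0);
    return i_tensor < (n_tensors * 5 + 7) / 8;  // ceil(5 * n_tensors / 8)
}
```

Either predicate would slot into the ternaries above in the same way, e.g. `new_type = difquant_five_eights_tensors_sketch(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;`.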