@@ -18618,7 +18618,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1861818618 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS)
1861918619 new_type = GGML_TYPE_IQ3_XXS;
1862018620 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS)
18621- new_type = GGML_TYPE_Q3_K ;
18621+ new_type = GGML_TYPE_IQ3_XXS ;
1862218622 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S)
1862318623 new_type = GGML_TYPE_IQ3_S;
1862418624 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
@@ -18833,7 +18833,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1883318833 }
1883418834 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1883518835 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18836- new_type = difquant_six_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
18836+ new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
1883718837 else new_type = GGML_TYPE_IQ4_XS;
1883818838 }
1883918839 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18971,7 +18971,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1897118971 }
1897218972 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1897318973 if (qs.model.hparams.n_vocab >= 151600 && qs.model.hparams.n_vocab <=151700)
18974- new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
18974+ new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
1897518975 else if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
1897618976 new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1897718977 else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
@@ -19104,7 +19104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1910419104 }
1910519105 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1910619106 if (qs.model.hparams.n_vocab >= 151600 && qs.model.hparams.n_vocab <=151700)
19107- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
19107+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
1910819108 else if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
1910919109 new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1911019110 }
0 commit comments