@@ -5311,6 +5311,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 3.95 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.05 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_UXL: return "IQ3_S mix - 4.15 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_MR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_LR: return "IQ4_XS mix - 4.xx bpw";
@@ -18451,7 +18452,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
             else new_type = GGML_TYPE_Q5_K;
         }
@@ -18699,10 +18700,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                    difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
             else new_type = GGML_TYPE_IQ4_XS;
         }
@@ -18877,6 +18877,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR) {
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_Q3_K;
+        }
         // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
         //     if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
         //         new_type = GGML_TYPE_IQ3_S;
@@ -19936,10 +19941,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_MR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_LR:
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_MR: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_LR: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_ML: default_type = GGML_TYPE_IQ3_S; break;