@@ -17768,14 +17768,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (qs.model.hparams.n_gqa() >= 7) {
+        // else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        }
+            // if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+            //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
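
To put numbers on the "nearly negligible increase in model size" claim above: with GQA, attn_v.weight has shape n_embd x (n_embd / n_gqa), so on Llama-2-70B (n_embd = 8192, n_head = 64, n_head_kv = 8, hence n_gqa() = 8, n_layer = 80) it holds 8x fewer weights than attn_q.weight. A minimal standalone worked example of what one k-quant tier of extra precision costs there (Q5_K is 5.5 bits per weight, Q6_K is 6.5625; the program is illustrative, not part of the patch):

#include <cstdio>

// Worked example for the GQA comment above (illustration only, not part of the patch).
// Llama-2-70B shapes: n_embd = 8192, n_gqa = n_head / n_head_kv = 64 / 8 = 8, n_layer = 80.
int main() {
    const double n_embd  = 8192.0, n_gqa = 8.0, n_layer = 80.0;
    const double v_elems = n_embd * (n_embd / n_gqa);  // attn_v.weight: ~8.4M weights per layer, 8x fewer than attn_q.weight
    const double bpw_q5k = 5.5, bpw_q6k = 6.5625;      // k-quant bits per weight
    const double extra_mib = v_elems * (bpw_q6k - bpw_q5k) * n_layer / 8.0 / (1024.0 * 1024.0);
    printf("Q5_K -> Q6_K on attn_v.weight: ~%.0f MiB extra\n", extra_mib);  // ~85 MiB
    return 0;
}

That is roughly 0.2% of a ~45 GiB quantized 70B, which is the trade the per-ftype n_gqa() >= 8 branches below keep exploiting.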
@@ -17797,30 +17797,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL ) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
             (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                     difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
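
The difquant_* predicates used throughout these branches are defined elsewhere in the file: each takes the tensor's layer index and the layer count and gates the higher of two candidate types, with the name indicating roughly which fraction of layers gets the bump (first/last, 3/8, 5/8, 6/8, ...). A hypothetical sketch of the shape such a selector presumably has; the cut-offs are assumptions, not the fork's actual definition:

// Hypothetical sketch only -- the real difquant_first_last_tensors lives
// elsewhere in this file and its exact cut-offs may differ. The idea: pick a
// fixed fraction of layers, biased toward the first and last blocks, which
// tend to be the most sensitive to quantization error.
static bool difquant_first_last_tensors(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8;  // ~2/8 of the layers
}

Under that reading, the IQ3_XXS attn_v branch above would give the boundary layers of a GQA 2-7 or MoE model Q5_K and the remaining layers Q4_K.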
@@ -17993,41 +18006,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                     difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
@@ -18139,8 +18160,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
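
Since the commit trades the blanket n_gqa() >= 7 rule for per-ftype n_gqa() >= 8 / >= 4 / >= 2 tiers, it helps to recall where common models land (n_gqa() is n_head / n_head_kv). A small standalone reference; the head counts are the published model configs, not values read from this code:

#include <cstdio>

// GQA quick reference for the thresholds used in this commit.
int main() {
    struct { const char * model; int n_head, n_head_kv; } cfgs[] = {
        { "Llama-2-70B",        64, 8 },  // n_gqa = 8  -> hits the new >= 8 branches
        { "Yi-34B",             56, 8 },  // n_gqa = 7
        { "Mistral-Large-123B", 96, 8 },  // n_gqa = 12
    };
    for (auto & c : cfgs) {
        printf("%-18s n_gqa = %2d\n", c.model, c.n_head / c.n_head_kv);
    }
    return 0;
}

One consequence: Yi-34B clears the old >= 7 threshold but not the new >= 8 one, so it now falls through to the >= 4 / >= 2 tiers rather than receiving the blanket Q5_K bump of the commented-out block.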