@@ -15345,32 +15345,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_Q4_0;
             }
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-        if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            ++qs.i_attention_wv;
-        }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            }
-            ++qs.i_ffn_down;
-        }
-        else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 8) {
-                new_type = GGML_TYPE_Q5_K;
-            } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
+            else {
+                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
+            ++qs.i_attention_wv;
         }
-    } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
+        else if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15380,7 +15366,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -15412,7 +15399,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15430,6 +15421,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q3_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
@@ -15440,6 +15435,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_down;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
@@ -15482,10 +15484,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                    new_type = GGML_TYPE_Q5_K;
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
+                else {
+                    if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
                 }
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
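
Side note (not part of the commit): every tensor-specific branch added above gates on the same family of low-bit i-quant ftypes. Purely as a reading aid, that recurring test can be thought of as the hypothetical helper sketched below. The helper name is_low_bit_iq is an assumption of this sketch, not code from the diff; it only relies on the llama_ftype enum from the public llama.h header. Note the attn_k.weight branch is the one place where the condition differs, since it omits LLAMA_FTYPE_MOSTLY_IQ2_M.

#include "llama.h" // public header defining llama_ftype and the LLAMA_FTYPE_MOSTLY_* values

// Hypothetical helper (not in the commit): true for the sub-3-bit i-quant
// ftypes that the new branches repeatedly check for.
static bool is_low_bit_iq(llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
        case LLAMA_FTYPE_MOSTLY_IQ2_S:
        case LLAMA_FTYPE_MOSTLY_IQ2_M:
        case LLAMA_FTYPE_MOSTLY_IQ1_S:
        case LLAMA_FTYPE_MOSTLY_IQ1_M:
            return true;
        default:
            return false;
    }
}

With that shorthand, the new attn_v.weight rule reads: under a low-bit i-quant ftype, models with n_expert >= 8 keep attn_v.weight at Q6_K, GQA or smaller MoE models (n_gqa() >= 2 or n_expert >= 2) get Q4_K, and the remaining dense models fall back to IQ3_XXS or IQ3_S depending on the ftype.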