Commit 7212098

IQ1 and IQ2 refactor
Attn_q in Q3_K for experts >= 8
Attn_k in Q5_K for experts >= 8
Attn_v in Q6_K for experts >= 8, in IQ3_XXS for IQ2_XXS and IQ2_XS
Attn_output in Q4_K for experts >= 8
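
Taken together, for models with 8 or more experts the commit pins each attention tensor to a fixed k-quant whenever one of the IQ1/IQ2 ftypes is requested. A minimal standalone sketch of that mapping (the moe_override() helper is hypothetical; the real logic is the repeated ftype chains in llama_tensor_get_type() in the diff below, where the attn_k branch notably omits IQ2_M from its ftype list):

#include <cstdio>
#include <cstring>
#include <initializer_list>

// Hypothetical summary of the n_expert >= 8 overrides this commit introduces
// for the IQ1_*/IQ2_* ftypes (type names printed as strings for brevity;
// the real code assigns GGML_TYPE_* enums).
static const char * moe_override(const char * tensor) {
    if (strstr(tensor, "attn_q.weight"))      return "Q3_K";
    if (strstr(tensor, "attn_k.weight"))      return "Q5_K";
    if (strstr(tensor, "attn_v.weight"))      return "Q6_K";
    if (strstr(tensor, "attn_output.weight")) return "Q4_K";
    return "(ftype default)";
}

int main() {
    for (const char * t : { "blk.0.attn_q.weight", "blk.0.attn_k.weight",
                            "blk.0.attn_v.weight", "blk.0.attn_output.weight" }) {
        printf("%-26s -> %s\n", t, moe_override(t));
    }
    return 0;
}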
1 parent 1bc4dc5 commit 7212098

File tree

1 file changed: +37 -29 lines

src/llama.cpp

Lines changed: 37 additions & 29 deletions
@@ -15345,32 +15345,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_Q4_0;
             }
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-        if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            ++qs.i_attention_wv;
-        }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            }
-            ++qs.i_ffn_down;
-        }
-        else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 8) {
-                new_type = GGML_TYPE_Q5_K;
-            } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
+            else {
+                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
+            ++qs.i_attention_wv;
         }
-    } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
+        else if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15380,7 +15366,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
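
A note on the GQA comment above: n_gqa() is n_head / n_head_kv, and attn_v.weight scales with n_head_kv while attn_q.weight scales with n_head, which is where the "n_gqa() times smaller" saving comes from. A quick check reproducing the two cited figures (the head counts below are assumptions consistent with those figures, not read from the model configs):

#include <cstdio>

int main() {
    // Head counts chosen to reproduce the GQA ratios cited in the comment
    struct { const char * model; int n_head, n_head_kv; } ex[] = {
        { "Yi 34B",             56, 8 },  // comment cites GQA 7
        { "Mistral Large 123B", 96, 8 },  // comment cites GQA 12
    };
    for (const auto & e : ex) {
        const int n_gqa = e.n_head / e.n_head_kv;
        printf("%-20s n_gqa = %2d -> attn_v ~%dx smaller than attn_q\n",
               e.model, n_gqa, n_gqa);
    }
    return 0;
}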
@@ -15412,7 +15399,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -15430,6 +15421,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q3_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
@@ -15440,6 +15435,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_down;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
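
The ffn_down branch relocated here keeps its old behavior: only the first eighth of the ffn_down tensors are bumped, to IQ3_S for IQ2_S/IQ2_M and to Q2_K for the other IQ1/IQ2 ftypes. A worked example of the cutoff with a hypothetical tensor count:

#include <cstdio>

int main() {
    const int n_ffn_down = 32;  // hypothetical; one ffn_down tensor per layer
    for (int i_ffn_down = 0; i_ffn_down < n_ffn_down; ++i_ffn_down) {
        // same test as the diff: bump only the first n_ffn_down/8 tensors
        if (i_ffn_down < n_ffn_down/8) {
            printf("ffn_down %2d: bumped (IQ3_S for IQ2_S/IQ2_M, else Q2_K)\n",
                   i_ffn_down);
        }
    }
    // -> tensors 0..3 of 32 are bumped; the rest keep the ftype default
    return 0;
}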
@@ -15482,10 +15484,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                    new_type = GGML_TYPE_Q5_K;
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
+                else {
+                    if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
                 }
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
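
To exercise these rules end to end, quantize a GGUF model to one of the affected ftypes; a minimal sketch against the public llama.h quantization API (file names are placeholders, error handling omitted):

#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XS;  // one of the refactored ftypes
    params.nthread = 8;
    // llama_tensor_get_type() applies the per-tensor overrides during this call
    return llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &params);
}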
