Skip to content

Commit 413fc43

Browse files
committed
Fix IQ3 <= M
1 parent 9ed3522 commit 413fc43

File tree

1 file changed

+33
-33
lines changed

1 file changed

+33
-33
lines changed

src/llama.cpp

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -17814,17 +17814,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1781417814
}
1781517815
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
1781617816
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17817-
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
17817+
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
1781817818
else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1781917819
}
1782017820
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
1782117821
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
17822-
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
17822+
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
1782317823
else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1782417824
}
1782517825
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1782617826
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
17827-
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
17827+
new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
1782817828
else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1782917829
}
1783017830
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -18246,7 +18246,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1824618246
}
1824718247
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1824818248
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18249-
new_type = difquant_six_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
18249+
new_type = difquant_five_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ3_XXS;
1825018250
else new_type = GGML_TYPE_IQ3_XXS;
1825118251
}
1825218252
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -18594,24 +18594,31 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1859418594
else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
1859518595
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
1859618596
new_type = GGML_TYPE_IQ2_XS;
18597-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
18598-
ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
18599-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
18600-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
18601-
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18602-
new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
18603-
else new_type = GGML_TYPE_IQ3_XXS;
18604-
}
18605-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18606-
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18607-
new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18608-
else new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18609-
}
18610-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18611-
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18612-
new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18613-
else new_type = GGML_TYPE_IQ3_S;
18614-
}
18597+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
18598+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
18599+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS)
18600+
new_type = GGML_TYPE_IQ3_XXS;
18601+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S)
18602+
new_type = GGML_TYPE_IQ3_XXS;
18603+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
18604+
ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
18605+
new_type = GGML_TYPE_IQ3_S;
18606+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q3_K;
18607+
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
18608+
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18609+
// new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
18610+
// else new_type = GGML_TYPE_IQ3_XXS;
18611+
// }
18612+
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18613+
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18614+
// new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18615+
// else new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18616+
// }
18617+
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18618+
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18619+
// new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18620+
// else new_type = GGML_TYPE_IQ3_S;
18621+
// }
1861518622
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1861618623
// if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
1861718624
// new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -18664,13 +18671,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1866418671
// new_type = GGML_TYPE_IQ4_XS;
1866518672
// else new_type = GGML_TYPE_Q3_K;
1866618673
// }
18667-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS)
18668-
new_type = GGML_TYPE_IQ3_S;
18669-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
18670-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
18671-
ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL)
18672-
new_type = GGML_TYPE_IQ3_S;
18673-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q3_K;
1867418674
} else {
1867518675
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
1867618676
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)
@@ -18680,8 +18680,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1868018680
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
1868118681
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
1868218682
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
18683-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S;
18684-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
18683+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS;
18684+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_S;
1868518685
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML) new_type = GGML_TYPE_IQ3_S;
1868618686
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ3_S;
1868718687
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) new_type = GGML_TYPE_IQ3_S;
@@ -18948,7 +18948,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1894818948
}
1894918949
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1895018950
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18951-
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18951+
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1895218952
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1895318953
}
1895418954
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -19083,7 +19083,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1908319083
}
1908419084
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
1908519085
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19086-
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
19086+
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1908719087
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1908819088
}
1908919089
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {

0 commit comments

Comments (0)