Commit 2d052f7

difquants three/four eights alt for Mistral Large
1 parent 29cecae commit 2d052f7

File tree: 1 file changed, +21 -5 lines

src/llama.cpp

Lines changed: 21 additions & 5 deletions
@@ -18106,6 +18106,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 7*n_layers/8;
     };
+    // difquant_three_eights_alt_tensors (for Mistral Large) has a broad 37.5% bump to the upper quant. Ex : 12/32
+    auto difquant_three_eights_alt_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || (i_layer > 4*n_layers/8 && i_layer < 5*n_layers/8) || i_layer >= 7*n_layers/8;
+    };
     // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     // The intervals of 3 are replaced by a broad bump in the central layers.
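The 12/32 figure in the new comment is easy to check with a throwaway harness (not part of the commit; the lambda is copied verbatim from the diff). For n_layers = 32 the predicate keeps layers 0-4, 17-19 and 28-31, i.e. 12 layers:

// sanity_check.cpp — standalone sketch verifying the 37.5% alt schedule.
#include <cstdio>

int main() {
    auto difquant_three_eights_alt_tensors = [](int i_layer, int n_layers) -> bool {
        return i_layer <= n_layers/8 || (i_layer > 4*n_layers/8 && i_layer < 5*n_layers/8) || i_layer >= 7*n_layers/8;
    };
    const int n_layers = 32;
    int bumped = 0;
    for (int i = 0; i < n_layers; ++i) {
        if (difquant_three_eights_alt_tensors(i, n_layers)) bumped++;
    }
    printf("bumped %d/%d layers\n", bumped, n_layers); // prints: bumped 12/32 layers
    return 0;
}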
@@ -18117,11 +18121,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         // (i_layer > 5*n_layers/8 && i_layer <= 6*n_layers/8) || i_layer > 7*n_layers/8;
         return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 6*n_layers/8;
     };
+    // difquant_half_alt_tensors (for Mistral Large) has a broad 50% bump to the upper quant. Ex : 16/32
+    auto difquant_half_alt_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || (i_layer > 4*n_layers/8 && i_layer < 5*n_layers/8) || i_layer >= 6*n_layers/8;
+    };
     // difquant_five_eights_tensors has a broad 62.5% bump to the upper quant. Ex : 20/32
     auto difquant_five_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 5*n_layers/8;
     };
-    // difquant_five_eights_alt_tensors has a broad 62.5% bump to the upper quant. Ex : 20/32
+    // difquant_five_eights_alt_tensors (for Mistral Large) has a broad 62.5% bump to the upper quant. Ex : 20/32
     auto difquant_five_eights_alt_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 4*n_layers/8;
     };
@@ -18134,7 +18142,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto difquant_seven_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= 6*n_layers/8 || i_layer > 7*n_layers/8; //best ppl
     };
-    // difquant_seven_eights_alt_tensors has a broad 87.5% bump to the upper quant. Ex : 28/32
+    // difquant_seven_eights_alt_tensors (for Mistral Large) has a broad 87.5% bump to the upper quant. Ex : 28/32
     auto difquant_seven_eights_alt_tensors = [](int i_layer, int n_layers) -> bool {
         // return i_layer <= 5*n_layers/8 || i_layer > 6*n_layers/8 (cata)
         return i_layer <= 3*n_layers/8 || i_layer > 4*n_layers/8; /* for my 36GB VRAM use; */
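Mistral Large 2 is commonly reported with 88 layers (an assumption here, not stated in the diff); since 88 is divisible by 8, each alt schedule hits its nominal fraction exactly. A small tally (standalone sketch, predicates copied from the code above) confirms 33, 44, 55 and 77 bumped layers respectively:

// alt_coverage.cpp — sketch tallying the four alt predicates at n_layers = 88.
#include <cstdio>
#include <functional>
#include <utility>

int main() {
    using pred = std::function<bool(int, int)>;
    const std::pair<const char *, pred> preds[] = {
        {"three_eights_alt", [](int i, int n) { return i <= n/8 || (i > 4*n/8 && i < 5*n/8) || i >= 7*n/8; }},
        {"half_alt",         [](int i, int n) { return i <= n/8 || (i > 4*n/8 && i < 5*n/8) || i >= 6*n/8; }},
        {"five_eights_alt",  [](int i, int n) { return i <= n/8 || i > 4*n/8; }},
        {"seven_eights_alt", [](int i, int n) { return i <= 3*n/8 || i > 4*n/8; }},
    };
    const int n = 88; // assumed Mistral Large 2 layer count
    for (const auto & p : preds) {
        int bumped = 0;
        for (int i = 0; i < n; ++i) bumped += p.second(i, n);
        printf("%-17s %d/%d (%.1f%%)\n", p.first, bumped, n, 100.0 * bumped / n);
    }
    return 0;
}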
@@ -19188,6 +19196,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_X4L) {
         if (qs.model.hparams.n_gqa() >= 12 || qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
             new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else if (qs.model.hparams.n_gqa() >= 12)
+            new_type = (difquant_half_alt_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
             new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_Q3_K;
         else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -19407,8 +19417,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_X4L) {
-        if (qs.model.hparams.n_gqa() >= 12 || qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+        if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
             new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else if (qs.model.hparams.n_gqa() >= 12)
+            new_type = difquant_half_alt_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
             new_type = difquant_six_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_Q3_K;
         else new_type = difquant_five_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
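The reordering matters for Mistral Large because 96 attention heads over 8 KV heads give n_gqa() = 12 while the vocab stays near 32768 (both figures are assumptions about Mistral Large 2, not stated in the diff): such a model previously matched the first branch and got difquant_half_tensors, and now falls through to difquant_half_alt_tensors. A standalone sketch of the attn_output dispatch above:

// dispatch_sketch.cpp — sketch of the post-commit branch order, with
// hypothetical hparams standing in for Mistral Large 2.
#include <cstdio>

int main() {
    const int n_gqa   = 96 / 8; // assumed head grouping -> 12
    const int n_vocab = 32768;  // assumed vocab size
    const int n_expert = 1;

    const char * schedule;
    if (n_vocab >= 127999 && (n_gqa >= 2 || n_expert >= 2))
        schedule = "difquant_half_tensors";     // large-vocab GQA/MoE models
    else if (n_gqa >= 12)
        schedule = "difquant_half_alt_tensors"; // Mistral Large now lands here
    else if (n_gqa >= 2 || n_expert >= 2)
        schedule = "difquant_six_eights_tensors";
    else
        schedule = "difquant_five_eights_tensors";
    printf("selected: %s\n", schedule); // prints: difquant_half_alt_tensors
    return 0;
}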
@@ -19826,8 +19838,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_X4L) {
-        if (qs.model.hparams.n_gqa() >= 12 || qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+        if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
             new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else if (qs.model.hparams.n_gqa() >= 12)
+            new_type = (difquant_half_alt_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
             new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_Q3_K;
         else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
@@ -20009,8 +20023,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_X4L) {
-        if (qs.model.hparams.n_gqa() >= 12 || qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+        if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
             new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else if (qs.model.hparams.n_gqa() >= 12)
+            new_type = (difquant_half_alt_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
             new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_Q3_K;
         else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
