IQ1_XS FTYPE quant strategy #6310

Closed
Changes from 13 commits
Commits (21)
f4949bc
b2532
Nexesenex Mar 25, 2024
8f7a7ee
Update quantize.cpp - Quant option IQ1_XS
Nexesenex Mar 25, 2024
3d88431
Update llama.h - Enum IQ1_XS
Nexesenex Mar 25, 2024
8eff402
Update llama.cpp - Case IQ1_XS
Nexesenex Mar 25, 2024
51ff04e
Update llama.cpp - Fix possible typo
Nexesenex Mar 25, 2024
1c4da5d
Update llama.cpp - Embeddings and output tensors strategy.
Nexesenex Mar 25, 2024
ddc7701
Update llama.cpp - Non-FFN layer-tensors strategy
Nexesenex Mar 25, 2024
b355333
Update llama.h - change IQ1_XS enum number
Nexesenex Mar 25, 2024
066efbb
Update llama.cpp - adjustements non-FFN layer tensors
Nexesenex Mar 25, 2024
3031c01
Update llama.cpp - correction wrong case declaration
Nexesenex Mar 25, 2024
9c27b0e
Update quantize.cpp - mix label
Nexesenex Mar 26, 2024
f162b2e
Update llama.cpp - correction embd.weight GQA-4 & qkv.weight to K-Quants
Nexesenex Mar 26, 2024
62c1f5b
Update llama.cpp typo
Nexesenex Mar 26, 2024
d183936
Update llama.cpp - remove trailing space
Nexesenex Mar 26, 2024
eaf9571
Update llama.cpp - exception for the IQ2_S token embedding error
Nexesenex Mar 26, 2024
599a4b2
Update llama.cpp - switch from IQ4_XS to Q4_K in related cases.
Nexesenex Mar 26, 2024
915be46
Merge branch 'master' into Nexesenex-IQ1_XS-IQ1_S-quant-strategies
Nexesenex Mar 26, 2024
ed4be6b
Update llama.cpp
Nexesenex Mar 29, 2024
dce3e27
Update llama.cpp - adjustements
Nexesenex Apr 1, 2024
3a83878
Merge branch 'master' into Nexesenex-IQ1_XS-IQ1_S-quant-strategies
mofosyne May 10, 2024
e4ac8ae
Update llama.h respect current numerology
Nexesenex May 10, 2024
examples/quantize/quantize.cpp (5 changes: 3 additions & 2 deletions)
@@ -26,13 +26,14 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M", },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
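With this entry registered in QUANT_OPTIONS, the new mix can be requested from the quantize tool like any other type name, e.g. `./quantize ggml-model-f16.gguf ggml-model-iq1_xs.gguf IQ1_XS` (a sketch of the usual `infile outfile ftype [nthreads]` invocation; the file names here are illustrative).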
llama.cpp (44 changes: 43 additions & 1 deletion)
@@ -3413,6 +3413,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS :return "IQ1_S mix - 1.6-1.7 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -12446,6 +12447,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_IQ4_XS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_Q5_K;
@@ -12461,13 +12466,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
                 new_type = GGML_TYPE_Q2_K;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+                if (qs.model.hparams.n_gqa() == 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
+                else new_type = GGML_TYPE_IQ2_S;
+            }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                 new_type = GGML_TYPE_IQ3_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                 new_type = GGML_TYPE_IQ3_S;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+        if (name.find("attn_q.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS;
+        }
+        if (name.find("attn_k.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
+            else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q2_K;
+        }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
@@ -12655,7 +12696,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -12754,6 +12795,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
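Reading the added branches as a whole (an informal summary, not part of the patch): a dense model with neither GQA nor experts (n_gqa() == 1, n_expert == 0) keeps attn_q and attn_k at the IQ1_S default, takes Q2_K for attn_v and attn_qkv, IQ2_XXS for attn_output, and IQ4_XS / IQ2_S from the two earlier hunks (the output and token-embedding handling). As n_gqa() or n_expert grows, the same tensors are stepped up, e.g. n_gqa() >= 8 raises attn_k to IQ2_S and attn_v to IQ4_XS, and MoE models with n_expert >= 8 go as high as Q5_K for attn_v.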
llama.h (1 change: 1 addition & 0 deletions)
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XS = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
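For programmatic use, the new enum value plugs into the existing quantization API. A minimal sketch, assuming the standard llama.h entry points (llama_model_quantize_default_params, llama_model_quantize, llama_backend_init/free); the file names are placeholders:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    // Start from the library defaults, then request the new IQ1_XS mix.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ1_XS; // enum value 32, added by this PR
    params.nthread = 8;                         // <= 0 lets the library choose a thread count

    // Returns 0 on success.
    const uint32_t rc = llama_model_quantize("ggml-model-f16.gguf", "ggml-model-iq1_xs.gguf", &params);

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}
```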