@@ -15899,11 +15899,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_IQ2_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
                 new_type = GGML_TYPE_Q4_0;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS;
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
@@ -16003,9 +16004,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) &&
-                 (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
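The behavioral change in the second hunk is that IQ3_M and IQ3_XL no longer treat all grouped-query-attention models alike: a GQA factor of 4 or more (or a model with at least 2 experts) now gets Q5_K, a factor of 2 to 3 keeps IQ4_XS, and plain multi-head attention is left at the incoming type. A minimal standalone sketch of that rule follows, assuming n_gqa() is n_head / n_head_kv as in llama.cpp's hparams; the enum and struct names here are hypothetical, for illustration only, not part of the patch.

// Standalone sketch of the new IQ3_M/IQ3_XL branch; names are illustrative.
#include <cstdint>
#include <cstdio>

enum ggml_type_sketch { T_IQ3_S, T_IQ4_XS, T_Q5_K };

struct hparams_sketch {
    uint32_t n_head, n_head_kv, n_expert;
    // GQA factor as llama.cpp computes it: query heads per KV head.
    uint32_t n_gqa() const { return n_head_kv ? n_head / n_head_kv : 0; }
};

// Mirrors the replaced branch: strong GQA (>= 4) or MoE models are bumped
// to Q5_K, mild GQA (>= 2) to IQ4_XS, and plain MHA keeps the current type.
static ggml_type_sketch pick(const hparams_sketch & hp, ggml_type_sketch cur) {
    if (hp.n_gqa() >= 4 || hp.n_expert >= 2) return T_Q5_K;
    if (hp.n_gqa() >= 2)                     return T_IQ4_XS;
    return cur;
}

int main() {
    hparams_sketch gqa4 = {32,  8, 0}; // e.g. a Llama-3-8B-like layout -> Q5_K
    hparams_sketch mha  = {32, 32, 0}; // e.g. a Llama-2-7B-like layout -> unchanged
    printf("%d %d\n", pick(gqa4, T_IQ3_S), pick(mha, T_IQ3_S));
    return 0;
}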