
Commit 93c35f8

attn.output.tensor of FTYPE IQ3_M in IQ4_XS
If FTYPE IQ4_XS has attn.output.tensor in IQ4_XS (4.25 BPW), there is no reason for FTYPE IQ3_M to have attn.output.tensor in Q4_K (4.5 BPW). On a Llama 3.1 70B model, the proposed change reduces the model size by 1% and increases the perplexity by 0.25%.
1 parent d5779c2 · commit 93c35f8
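The size claim can be sanity-checked with back-of-envelope arithmetic. Below is a minimal sketch (not from the commit), assuming rough Llama 3.1 70B shapes (80 layers with 8192x8192 attn.output tensors) and an assumed ~3.7 BPW average for the rest of the IQ3_M mix:

// Back-of-envelope check of the commit's ~1% size claim; all numbers
// below are rough assumptions, not values taken from the commit.
#include <cstdio>

int main() {
    const double n_layers     = 80;                          // Llama 3.1 70B
    const double wo_params    = 8192.0 * 8192.0 * n_layers;  // attn.output weights, ~5.4e9
    const double total_params = 70.6e9;

    const double bpw_q4_k   = 4.50;  // Q4_K bits per weight
    const double bpw_iq4_xs = 4.25;  // IQ4_XS bits per weight
    const double bpw_rest   = 3.70;  // assumed average BPW of the remaining IQ3_M mix

    // Total bits before and after switching attn.output from Q4_K to IQ4_XS.
    const double before = wo_params * bpw_q4_k   + (total_params - wo_params) * bpw_rest;
    const double after  = wo_params * bpw_iq4_xs + (total_params - wo_params) * bpw_rest;

    // Prints roughly -0.5% with these assumptions.
    printf("relative size change: %+.2f%%\n", 100.0 * (after - before) / before);
    return 0;
}

With these round numbers the computed reduction is about half a percent; the reported 1% reflects the actual per-tensor mix, which this sketch only approximates.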

File tree

1 file changed (+1, -1 lines changed)


src/llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -15486,7 +15486,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_IQ4_XS;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
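For reference, this branch is taken when quantizing with the IQ3_M preset, e.g. with the llama-quantize tool built from this tree (the file names below are illustrative):

./llama-quantize Meta-Llama-3.1-70B-F16.gguf Meta-Llama-3.1-70B-IQ3_M.gguf IQ3_M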
