@@ -15330,13 +15330,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1533015330 if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
1533115331 new_type = qs.params->token_embedding_type;
1533215332 } else {
15333- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
15334- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
15335- new_type = GGML_TYPE_Q2_K;
15336- }
15337- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
15338- new_type = GGML_TYPE_IQ3_S;
15339- }
15333+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_S;
15334+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K;
15335+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
15336+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
1534015337 else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
1534115338 new_type = GGML_TYPE_Q4_0;
1534215339 }
@@ -15384,7 +15381,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1538415381 new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1538515382 }
1538615383 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
15387- else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
15384+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
15385+ (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1538815386 new_type = GGML_TYPE_Q5_K;
1538915387 }
1539015388 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -15445,20 +15443,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1544515443 int i_layer = info.first, n_layer = info.second;
1544615444 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
1544715445 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
15448- if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K ;
15446+ if (use_more_bits( i_layer, n_layer)) new_type = GGML_TYPE_Q3_K ;
1544915447 }
15450- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
15451- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
15452- if (qs.i_ffn_down < qs.n_ffn_down/8) {
15453- new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
15454- }
15455- ++qs.i_ffn_down;
15448+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
15449+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS;
15450+ }
15451+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
15452+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XS;
15453+ }
15454+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
15455+ if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S;
15456+ }
15457+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
15458+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S;
15459+ }
15460+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
15461+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS;
1545615462 }
1545715463 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
1545815464 new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1545915465 }
1546015466 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
15461- new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
15467+ new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K
1546215468 : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
1546315469 : GGML_TYPE_Q3_K;
1546415470 }
@@ -15505,7 +15511,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1550515511 if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
1550615512 else {
1550715513 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
15508- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
15514+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
15515+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
1550915516 }
1551015517 } else {
1551115518 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
@@ -15522,7 +15529,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1552215529 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
1552315530 new_type = GGML_TYPE_Q4_K;
1552415531 }
15532+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
15533+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
15534+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
15535+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
15536+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
15537+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS;
15538+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
1552515539 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
15540+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
1552615541 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
1552715542 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
1552815543 }
@@ -15532,6 +15547,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1553215547 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
1553315548 new_type = GGML_TYPE_IQ3_XXS;
1553415549 }
15550+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
15551+ if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S;
15552+ }
15553+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
15554+ if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS;
15555+ }
1553515556 ++qs.i_ffn_gate;
1553615557 }
1553715558 else if (name.find("ffn_up") != std::string::npos) {
@@ -15540,6 +15561,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1554015561 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
1554115562 new_type = GGML_TYPE_IQ3_XXS;
1554215563 }
15564+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
15565+ if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S;
15566+ }
15567+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
15568+ if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS;
15569+ }
1554315570 ++qs.i_ffn_up;
1554415571 }
1554515572
@@ -15976,8 +16003,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1597616003 }
1597716004 if ((new_type == GGML_TYPE_IQ2_XXS ||
1597816005 new_type == GGML_TYPE_IQ2_XS ||
15979- new_type == GGML_TYPE_IQ2_S ||
1598016006 new_type == GGML_TYPE_IQ1_S ||
16007+ (new_type == GGML_TYPE_IQ2_S && strcmp(tensor->name, "token_embd.weight")) ||
1598116008 (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
1598216009 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
1598316010 LLAMA_LOG_ERROR("\n\n============================================================\n");
0 commit comments