Commit f9ff582
Squashed commits of the NQ PR (partly unpublished), as of 11/11/2024.

All commits below are authored by Nexesenex (also signing as "Nexes the Old") <124105151+Nexesenex@users.noreply.github.com>, except where noted.

- bb2a4d125e33148d2b4e9363bf8ace14f722a610 (Mon Nov 11 08:59:32 2024 +0100) 8x22b
- 9d4926ff9559ecae25f19fadcb55586677575b61 (Mon Nov 11 08:59:07 2024 +0100) Merge branch 'master' into Nexes_CQ30
- 9c65f44 (Sun Nov 3 04:30:14 2024 +0100) Test base 2048
- 8ccafe8 (Sun Nov 3 04:28:33 2024 +0100) Merge branch 'master' into Nexes_CQ30
- d0d276f (Fri Nov 1 20:18:27 2024 +0100) Merge branch 'master' into Nexes_CQ30
- 7cecefd (Mon Oct 28 06:45:16 2024 +0100) Merge branch 'master' into Nexes_CQ30
- a5303b7 (Thu Oct 24 19:55:36 2024 +0200) Merge branch 'master' into Nexes_CQ30
- f21ab1e (Wed Oct 23 20:26:42 2024 +0200) Merge branch 'gg/default-kq-f32-prec' into Nexes_CQ20
- c72289e (Wed Oct 23 20:26:34 2024 +0200) Merge branch 'master' into Nexes_CQ20
- 20011f1 (Wed Oct 23 14:32:27 2024 +0300, author: Georgi Gerganov <ggerganov@gmail.com>) llama : switch KQ multiplication to use F32 precision by default (ggml-ci)
- eaee12e (Mon Oct 21 15:41:24 2024 +0200) EXL, SXL and UXL types to test the new bits formula
- 6abef2a (Mon Oct 21 15:40:22 2024 +0200) Merge branch 'master' into Nexes_CQ20
- aa73a4e (Sat Oct 19 19:04:33 2024 +0200) use_some_bits and use_most_bits
- 7794c8f (Sat Oct 19 19:04:05 2024 +0200) Merge branch 'master' into Nexes_CQ20
- 1cf274d (Fri Oct 18 21:00:56 2024 +0200) ML, UXL and EXL boost
- f105e0f (Fri Oct 18 21:05:49 2024 +0200) Revert compile for Ampere
- 1b25cbb (Fri Oct 18 21:05:04 2024 +0200) Delete CMakePresets.json
- 1c440a8 (Fri Oct 18 20:42:34 2024 +0200) Merge branch 'master' into Nexes_CQ20
- 366e0c8 (Wed Oct 16 16:57:30 2024 +0200) Fix indent of model sizes
- cf8375c (Wed Oct 16 16:41:57 2024 +0200) Continue Q5_K mixes
- 2d052f7 (Tue Oct 15 17:42:48 2024 +0200) difquants three/four eights alt for Mistral Large
- 29cecae (Tue Oct 15 16:03:12 2024 +0200) Q5_K_XSR, SR, ML, and XL revamp
- 412b56f (Mon Oct 14 17:08:23 2024 +0200) IQ3_X5L and IQ3_X7L fix for Mistral Large
- ca86ce8 (Mon Oct 14 15:24:37 2024 +0200) Pursue IQ3 revamp
- 6c51f39 (Sun Oct 13 22:22:40 2024 +0200) IQ3_XXXXL, EXL, and renaming of the >=IQ3_ML scheme (test for Mistral Large; IQ3_XL = IQ3_X5L, and so on)
- 64bfe69 (Sun Oct 13 22:33:05 2024 +0200) Activate F16
- 575ebc2 (Sun Oct 13 22:22:30 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 38229d3 (Sat Oct 12 20:57:18 2024 +0200) Fix "specify tensors" in quantize
- b947b6e (Sat Oct 12 13:38:22 2024 +0200) New FTYPE Q5_K_XL
- ba1b854 (Sat Oct 12 13:36:25 2024 +0200) New FTYPE IQ4_XXSR, and beef up attn_k in IQ4_XSR
- 79fa98c (Sun Oct 13 02:00:38 2024 +0200) GGML_MAX_COPIES_1 in CML
- f95ed01 (Sun Oct 13 02:02:06 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- accd71d (Sat Oct 12 13:23:11 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- b5103f4 (Fri Oct 11 13:43:48 2024 +0200) Better model info (ikawrakow#84) (Co-Authored-By: Kawrakow <iwankawrakow@gmail.com>)
- b302561 (Fri Oct 11 13:17:39 2024 +0200) IQ3_UXL for test
- 8c6e408 (Fri Oct 11 13:17:30 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 66a9b05 (Wed Oct 9 04:30:45 2024 +0200) Correct IQ4_LR
- 298990a (Tue Oct 8 22:11:53 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- f1814f1 (Mon Oct 7 23:21:56 2024 +0200) Rebump attn_v
- b94a9b0 (Mon Oct 7 23:21:38 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 18677c8 (Sun Oct 6 02:12:09 2024 +0200) IQ4_LR
- a2500c1 (Sun Oct 6 02:12:55 2024 +0200) Crack down on fallback GGML_types
- 75b8800 (Sat Oct 5 23:18:02 2024 +0200) More overhaul for IQ4_XSR, and new IQ4_MR
- 167a3c5 (Sat Oct 5 17:17:50 2024 +0200) GGML_SCHED_MAX_COPIES 1
- 8433050 (Sat Oct 5 17:14:39 2024 +0200) Adapt CML
- 1e0f64e (Sat Oct 5 17:07:07 2024 +0200) Compile for Ampere
- 35ce3f6 (Sat Oct 5 17:03:34 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 6480054 (Fri Oct 4 18:21:54 2024 +0200) IQ4_XSR revamp
- 1ec8328 (Mon Aug 19 17:00:34 2024 +0200) Clarify PPL result
- de50e13 (Thu Oct 3 22:23:08 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- ed67589 (Tue Sep 24 10:22:50 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 06ab3a2 (Tue Sep 24 10:22:46 2024 +0200) More size logging
- 9d97928 (Tue Sep 24 10:21:25 2024 +0200) Update llama.cpp
- 700d205 (Tue Sep 24 03:51:26 2024 +0200) IQ3_XS, more
- da840a3 (Tue Sep 24 03:30:18 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 056c47d (Tue Sep 24 03:30:15 2024 +0200) Reapply "threadpool : skip polling for unused threads (ggml-org#9461)" (reverts commit 2a8dbf8)
- 8d789ac (Tue Sep 24 03:20:58 2024 +0200) IQ3_XS
- 413fc43 (Mon Sep 23 19:34:45 2024 +0200) Fix IQ3 <=M
- 9ed3522 (Mon Sep 23 18:50:43 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- 2a8dbf8 (Sun Sep 22 02:48:50 2024 +0200) Revert "threadpool : skip polling for unused threads (ggml-org#9461)" (reverts commit 0226613)
- 6faac9f (Sun Sep 22 02:46:37 2024 +0200) Revert "Update CUDA graph on scale change plus clear nodes/params (ggml-org#9550)" (reverts commit 41f4778)
- f377f88 (Sat Sep 21 17:25:04 2024 +0200) Merge branch 'master' into Nexes_CQ_10
- e3ec684 (Fri Sep 20 06:36:47 2024 +0200) Reinsert CQs
- d48aad3 (Mon Sep 2 05:50:08 2024 +0200) Play with IQ3 quants
- 5af6481 (Mon Sep 2 01:41:19 2024 +0200) IQ4_XSR rework
- dd770d2 (Sat Aug 31 17:05:00 2024 +0200) Refine IQ3 quants
- 32ce04a (Sat Aug 31 14:22:00 2024 +0200) Use of vocab as difquant criteria (models with a vocab > 128k are more sensitive to the ffn_down quant than to ffn_gate and up)
- 86a7e4a (Fri Aug 30 12:15:54 2024 +0200) IQ3_UXL
- 97fbd74 (Thu Aug 29 22:40:32 2024 +0200) New difquant seven_eights
- c6732bf (Wed Aug 28 16:06:38 2024 +0200) Bump a bit the output for big models in IQ2 and IQ3
- cce61d3 (Wed Aug 28 13:00:53 2024 +0200) Difquant attn_q and attn_o for IQ3_XXS, XS, and S; also establish a bump to difquant_first_last_tensors for attn_k and attn_v
- 1e7e816 (Wed Aug 28 02:24:55 2024 +0200) Add IQ3_ML, reinstate IQ3_XXXL
- 7b0dc30 (Wed Aug 28 00:52:45 2024 +0200) Bump IQ3_XS
- 6263649 (Tue Aug 27 16:19:10 2024 +0200) Revert variable V below Q5_K
- eb4a69e (Tue Aug 27 13:26:15 2024 +0200) Difquant for IQ2_XL & IQ3 for attn_k and attn_v; prepare difquant of these quants for attn_o and attn_q
- c84d981 (Tue Aug 27 06:13:39 2024 +0200) Correct settings
- c667f2e (Mon Aug 26 23:05:04 2024 +0200) Temporary settings for IQ3 attn_k and attn_v
- 294aeec (Mon Aug 26 18:18:05 2024 +0200) Corrections and clean-up: back to Q8_0 for attn_k and attn_v if 8 experts or more; attn_v and attn_k with experts>=4 and GQA>=12 brought back to the experts>=4 quant level instead of 8; GQA8 brought to GQA7, and GQA7 brought to GQA4
- e7c5163 (Mon Aug 26 14:33:34 2024 +0200) Shrink a bit Q2_K when GQA<2, and optimize difquants_first_last and fl_more
- ff48606 (Mon Aug 26 14:02:09 2024 +0200) IQ1_XL, IQ2_S, IQ2_XS enhanced
- 8a1ab24 (Mon Aug 26 12:57:21 2024 +0200) IQ1_XS, IQ1_S, IQ1_M, IQ2_XXS, Q2_M, Q2_K enhanced; testing templates for other quants
- 26aac8e (Sun Aug 25 14:42:33 2024 +0200) Soften the token embeddings bump for experts >= 4
- 5644d4c (Fri Sep 20 01:38:20 2024 +0200) Merge branch 'master' into pr/8836
- 16aee45 (Sun Aug 25 14:25:46 2024 +0200) Correction
- dd3df75 (Sun Aug 25 03:30:36 2024 +0200) Bad indents and trailing whitespaces
- f63860e (Sun Aug 25 03:17:21 2024 +0200) Put back the ffn_down tree where it was before
- 8fc46df (Sat Aug 24 22:30:45 2024 +0200) Bump a bit ffn_gate and down for some GQA<2 models
- 53b8eaa (Sat Aug 24 21:57:07 2024 +0200) Remove deprecated rules for token embeddings
- 844d11b (Sat Aug 24 21:02:51 2024 +0200) Bad indent
- 5ae5971 (Sat Aug 24 20:50:07 2024 +0200) Revamp Q2_K and Q3_K quants (Q3_K_XL takes the place of Q3_K_L; Q3_K_L becomes intermediary between Q3_K_M and XL)
- 1bde168 (Fri Aug 23 23:27:26 2024 +0200) Usage of n_head to discriminate very small models, whose size is more sensitive to the non-repeating tensors
- 16e9c37 (Fri Aug 23 23:18:59 2024 +0200) Various corrections on IQ2_S+ and IQ3 quants
- 380b53d (Fri Aug 23 21:59:34 2024 +0200) Fix IQ4_XSR
- 6081085 (Fri Aug 23 17:48:31 2024 +0200) Revamp attn_output
- 6b5cebf (Fri Aug 23 16:40:40 2024 +0200) Revamp a bit the output weight for more granularity in low quants
- f796954 (Fri Aug 23 14:17:19 2024 +0200) Revamp FFN down and attn_k, complete FFN up, and shrink a bit more the non-GQA models
- 596a4ae (Thu Aug 22 19:12:25 2024 +0200) Re-add variable attn_k, attn_q, attn_o after merge
- fb2b9ea (Sun Aug 25 02:59:57 2024 +0200) Merge branch 'master' into pr/8836
- 3a027b8 (Fri Aug 23 00:08:42 2024 +0200) Revamp IQ4_XSR, remove IQ3_XXXL
- e05da54 (Thu Aug 22 19:12:13 2024 +0200) Overhaul of FFN, if GQA and if not
- 1607a02 (Fri Aug 23 12:38:45 2024 +0200) Further adjustments to the difquant formulas
- 179ad0f (Wed Aug 21 13:10:54 2024 +0200) Little rework of the difquant formulas
- 644aa9f (Wed Aug 21 13:07:32 2024 +0200) Correction on too-small tensor embeddings to quantize (IQ2_XS doesn't seem to work as such, back to IQ2_S)
- 32f6ead (Mon Aug 19 17:58:12 2024 +0200) Improve IQ1 and IQ2 quants; fix mistakes for the attn.output of IQ2_XL and the ffn gate and up of IQ2_XS; reformat the attn_output mess and split GQA4/GQA2
- d7b9d21 (Tue Aug 20 12:45:30 2024 +0200) Shrink a bit IQ3_XXS, bump a bit IQ3_M
- dbadcdd (Tue Aug 20 11:59:41 2024 +0200) Harmonize formatting of tensor type conditions
- ce86019 (Wed Aug 21 12:25:38 2024 +0200) Change function use_*_bits into difquant_*_tensors, to clarify what it does, especially with the 5 additional levels of difquant
- cfe866e (Wed Aug 21 12:23:41 2024 +0200) Merge branch 'master' into pr/8836
- fddff02 (Mon Aug 19 01:43:31 2024 +0200) Rework IQ3_XXS and IQ3_XS, and fix a parenthesis mistake on IQ3_S
- 207ffe6 (Sun Aug 18 23:28:13 2024 +0200) Reorder, corrections, settling the lower IQ3 quants
- 8c1a3c5 (Tue Aug 20 00:48:05 2024 +0200) Merge branch 'master' into pr/8836
- a7f9164 (Mon Aug 19 16:02:00 2024 +0200) Fix mistake
- caeb839 (Sun Aug 18 17:58:17 2024 +0200) Boost embeddings and output weights for MoEs: they are single and non-repeating, so the boost is reasonable compared to the size of 4 or more experts
- 503048a (Sun Aug 18 17:44:11 2024 +0200) Correct IQ3_M
- ddb1373 (Sun Aug 18 16:56:55 2024 +0200) IQ3_XXL and IQ3_XXXL: we now have a full range of quants between IQ3_M and IQ4_XS
- a79633b (Sun Aug 18 22:12:39 2024 +0200) Merge branch 'master' into pr/8836
- b02eaf6 (Sat Aug 17 14:58:25 2024 +0200) Mass use of the few/some/more/many bits bump logic; add the few-bits logic and rework the 4 settings for a 25/37.5/50/75% quant bump when used
- 4ba5618 (Sat Aug 17 12:31:36 2024 +0200) Adapt token embeddings and output.weight to vocab size: due to the huge increase of the embeddings and output weight size for models with a huge vocab, these seem to quantize with less loss
- 17b7151 (Sat Aug 17 00:17:41 2024 +0200) Update IQ3_M attn_k and IQ3_XL token_embd
- e4c506d (Sun Aug 18 04:09:22 2024 +0200) Merge branch 'master' into pr/8836
- eeccd31 (Thu Aug 15 02:30:10 2024 +0200) Merge branch 'master' into pr/8836
- 8c9017b (Mon Aug 12 22:20:02 2024 +0200) Simplify IQ4_XSR, but leave in place as a "demo" the more complex template set by Ikawrakow to customize the layer quants, with the added attn_q, attn_k, and attn_output tensors
- 8c10533 (Mon Aug 12 20:28:38 2024 +0200) Merge branch 'master' into pr/8836
- cd92ba6 (Mon Aug 12 19:45:46 2024 +0200) IQ4_XSR (test FTYPE) and attention_wv logic for all attn_*.weights; also, advise iMatrix for the IQ2_M and Q2_K FTypes
- 3e2eb6d (Mon Aug 12 14:25:23 2024 +0200) Merge branch 'master' into pr/8836
- df9e6fd (Sun Aug 11 21:49:23 2024 +0200) Adjustments on output and embeddings
- 1ad18f8 (Sun Aug 11 21:44:29 2024 +0200) Adjustments on attn_k
- 8c2c03f (Sun Aug 11 16:46:15 2024 +0200) Merge b3569
- 91db53b (Sun Aug 11 16:41:23 2024 +0200) IQ1_XL and some corrections, notably on attn_q and parenthesis
- 1268d58 (Sun Aug 11 02:13:08 2024 +0200) More adjustments
- ef83a87 (Sun Aug 11 01:30:18 2024 +0200) Revert of ffn gate and up on IQ3_M, and indent
- e2e2d77 (Sun Aug 11 01:13:12 2024 +0200) Misplaced file, lol
- 8ad71f4 (Sun Aug 11 01:11:24 2024 +0200) IQ1_XS and small adjustments
- 14f4f40 (Sat Aug 10 20:45:26 2024 +0200) Merge b3565
- 8bc7a98 (Sat Aug 10 20:40:27 2024 +0200) 2 forgotten files
- f0806ac (Sat Aug 10 20:34:17 2024 +0200) IQ2_XL, IQ3_XL, Q2_K_L, plus some adjustments on the FFNs
- 49617b1 (Sat Aug 10 18:37:29 2024 +0200) Advancing on several tensors: progressivity for token embeddings and attn_qkv; FFN down for IQ1 and IQ2 quants; FFN gate and up for IQ2_S and IQ2_M, for progressivity in the IQ2 range
- 415d5e4 (Sat Aug 10 17:32:29 2024 +0200) Refactor furthermore attn.v, and also lower attn_q for IQ2_XS, in order to separate it more from the quite misnamed IQ2_S
- 8c8e43c (Sat Aug 10 16:38:11 2024 +0200) Settings for MoE >= 8 experts applied to >= 4 experts
- aa4eb59 (Sat Aug 10 16:33:55 2024 +0200) Further refactor attn_k, with attn_k set for all quants below 3 bpw except Q2_K_S
- 8f1b99f (Sat Aug 10 13:09:11 2024 +0200) Shortening formatting
- 7212098 (Sat Aug 10 12:52:57 2024 +0200) IQ1 and IQ2 refactor: attn_q in Q3_K for experts >= 8; attn_k in Q5_K for experts >= 8; attn_v in Q6_K for experts >= 8, and in IQ3_XXS for IQ2_XXS and IQ2_XS; attn_output in Q4_K for experts >= 8
- 1bc4dc5 (Fri Aug 9 22:49:42 2024 +0200) Bump IQ3_M: attn.v in Q5_K, attn.k in IQ4_XS
- 1118c04 (Thu Aug 8 18:56:20 2024 +0200) Correct mistake in conditionality for attn.k
- 8006b15 (Thu Aug 8 18:50:48 2024 +0200) Avoid shrinking attn.k.weight for IQ3_XS and XXS when GQA or MoE
- 59c5d47 (Sun Aug 4 12:06:06 2024 +0200) attn_qkv.weight in IQ4_XS for FTYPE IQ3_M: if FTYPE IQ4_XS has attn_qkv.weight in IQ4_XS, then FTYPE IQ3_M should not have it in Q4_K (4.5 bpw), but in IQ4_XS (4.25 bpw) as well
- 93c35f8 (Sun Aug 4 11:59:52 2024 +0200) attn.output.tensor of FTYPE IQ3_M in IQ4_XS: if FTYPE IQ4_XS has attn.output.tensor in IQ4_XS, there's no reason for FTYPE IQ3_M to have attn.output.tensor in Q4_K (4.5 bpw). In terms of perplexity, on a Llama 3.1 70b model, the proposed change reduces the size by 1% and increases the perplexity by 0.25%
- d5779c2 (Sat Aug 3 03:04:25 2024 +0200) More occurrences of n_experts == 8 changed to >= in the quant strategies
- 7d337d0 (Sat Aug 3 01:35:08 2024 +0200) Slight reorder of the attn.weight tree, and application of the attn.v.weight logic used for IQ2 and IQ3, but only when such logic is already implied by the existing quant strategies, as a compromise to not disturb too much Ikawrakow's quant strategies
- 6398663 (Fri Aug 2 23:49:03 2024 +0200) Apply the GQA2/Expert2 conditionality to the IQ3 quants, in coherence with the proposed modifications to the IQ2 quant strategies, which make even more sense for the IQ3 quant strategies
- b77cdd8 (Fri Aug 2 20:40:04 2024 +0200) Small changes for the IQ2 quant strategies (notably IQ2_S and IQ2_M). Here are a few edits I consider useful to improve a bit the IQ2 model quant strategies for some models:
    - The tensor attn.v.weight passed in Q4_K for models like Gemma (GQA 2) and the various franken-MoEs having 2 experts, so as not to sabotage them with a too-small value head quant (Q2_K is meh for such an important head) while the size of that head is low relative to the total size of the affected models.
    - The tensor attn.k.weight passed in Q4_K for models with 8 experts or more, rather than for exactly 8 experts.
    - The tensor attn.output.weight passed in IQ3_XXS (instead of IQ3_S) for the quant strategies IQ2_S and IQ2_M, to obtain a progression between the IQ2_XS quant strategies (which use IQ2_XS for attn.output.weight) and the IQ3_XXS quant strategies (which use IQ3_S for attn.output.weight). The benefit of an IQ3_S quant instead of an IQ3_XXS for that tensor is quasi-inexistent for the IQ2_S and IQ2_M quant strategies, especially compared to the size bump it provokes.
  More broadly, I think that the whole IQ2 bunch of quant strategies should be harmonized/refactored like the rest of the quant strategies are established (tensor by tensor), rather than under a different kind of tree mixing these 5 quant strategies. I've been using these settings (and many more edits) for a long time, with benefit, and I think they could be standard.
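As a concrete illustration of the tensor-by-tensor approach described in that last commit, here is a minimal hypothetical sketch of its three rules (struct and helper names are illustrative, not the fork's actual code; the real selection happens in llama.cpp's quantization type-selection logic):

    // Hypothetical sketch of the b77cdd8 rules; names are illustrative only.
    #include <string>
    #include "ggml.h"

    struct model_info {
        int n_expert; // number of MoE experts (0/1 for dense models)
        int gqa;      // grouped-query attention ratio, n_head / n_head_kv
    };

    // Pick a GGML type for one tensor of an IQ2_S / IQ2_M quant mix.
    static ggml_type pick_iq2_type(const std::string & name, const model_info & m) {
        if (name.find("attn_v.weight") != std::string::npos) {
            // GQA-2 models (e.g. Gemma) and 2-expert franken-MoEs get a bigger
            // value head: Q2_K is too small for such an important tensor.
            return (m.gqa == 2 || m.n_expert == 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q2_K;
        }
        if (name.find("attn_k.weight") != std::string::npos) {
            // ">= 8 experts" rather than "exactly 8 experts".
            return m.n_expert >= 8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q2_K;
        }
        if (name.find("attn_output.weight") != std::string::npos) {
            // Intermediate step between IQ2_XS (which uses IQ2_XS here)
            // and IQ3_XXS (which uses IQ3_S here).
            return GGML_TYPE_IQ3_XXS;
        }
        return GGML_TYPE_IQ2_XS; // default body quant for the mix
    }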
1 parent 4115aa4 · commit f9ff582

File tree

11 files changed: +2487 −186 lines

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
+    int32_t n_ctx = 2048; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
@@ -675,7 +675,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     nll2 -= nll * nll;
     if (nll2 > 0) {
         nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        printf("Final estimate: PPL over %d chunks for n_ctx=%d = %.4lf +/- %.5lf\n", n_chunk, n_ctx, ppl, nll2*ppl);
     } else {
         printf("Unexpected negative standard deviation of log(prob)\n");
     }
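For reference, the two numbers printed by this line follow the statistics already computed in this function (a sketch matching the code's variables, where n is count and nll2 holds the sample variance of the per-token negative log-likelihood):

    \mathrm{PPL} = \exp\!\Big(\frac{1}{n}\sum_{i=1}^{n}\mathrm{nll}_i\Big),
    \qquad
    \sigma_{\mathrm{PPL}} \approx \mathrm{PPL}\cdot\sqrt{\frac{\operatorname{Var}(\mathrm{nll})}{n-1}}

That is, the ± term is the standard error of the mean log-likelihood propagated through the exponential (delta method).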

examples/quantize/quantize.cpp

Lines changed: 101 additions & 6 deletions
@@ -25,8 +25,11 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization", },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5 bpw quantization", },
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7 bpw quantization", },
+    { "IQ2_XL",  LLAMA_FTYPE_MOSTLY_IQ2_XL,  " 2.85 bpw quantization mix", },
+    { "IQ1_XS",  LLAMA_FTYPE_MOSTLY_IQ1_XS,  " 1.6-1.7 bpw quantization mix", },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization", },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
+
     { "IQ1_BN",  LLAMA_FTYPE_MOSTLY_IQ1_BN,  " 1.62 bpw quantization (Bitnet)", },
     { "IQ2_BN",  LLAMA_FTYPE_MOSTLY_IQ2_BN,  " 2.00 bpw quantization (Bitnet)", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
@@ -58,6 +61,50 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
+
+    { "IQ1_XL",    LLAMA_FTYPE_MOSTLY_IQ1_XL,    " 1.90 bpw quantization", },
+    { "TQ1_0",     LLAMA_FTYPE_MOSTLY_TQ1_0,     " 1.69 bpw ternarization", },
+    { "TQ2_0",     LLAMA_FTYPE_MOSTLY_TQ2_0,     " 2.06 bpw ternarization", },
+    { "Q2_K",      LLAMA_FTYPE_MOSTLY_Q2_K,      " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S",    LLAMA_FTYPE_MOSTLY_Q2_K_S,    " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "Q2_K_L",    LLAMA_FTYPE_MOSTLY_Q2_K_L,    " 3.20G, +3.1836 ppl @ Llama-3-8B", },
+    { "IQ3_XXS",   LLAMA_FTYPE_MOSTLY_IQ3_XXS,   " 3.06 bpw quantization", },
+    { "IQ3_S",     LLAMA_FTYPE_MOSTLY_IQ3_S,     " 3.44 bpw quantization", },
+    { "IQ3_S2L",   LLAMA_FTYPE_MOSTLY_IQ3_S2L,   " 3.55 bpw quantization", },
+    { "IQ3_M",     LLAMA_FTYPE_MOSTLY_IQ3_M,     " 3.60 bpw quantization mix", },
+    { "IQ3_M3L",   LLAMA_FTYPE_MOSTLY_IQ3_M3L,   " 3.70 bpw quantization mix", },
+    { "IQ3_X4L",   LLAMA_FTYPE_MOSTLY_IQ3_X4L,   " 3.80 bpw quantization mix", },
+    { "IQ3_X5L",   LLAMA_FTYPE_MOSTLY_IQ3_X5L,   " 3.90 bpw quantization mix", },
+    { "IQ3_X6L",   LLAMA_FTYPE_MOSTLY_IQ3_X6L,   " 4.00 bpw quantization mix", },
+    { "IQ3_X7L",   LLAMA_FTYPE_MOSTLY_IQ3_X7L,   " 4.10 bpw quantization mix", },
+    { "IQ3_EXL",   LLAMA_FTYPE_MOSTLY_IQ3_EXL,   " 3.75 bpw quantization mix", },
+    { "IQ3_SXL",   LLAMA_FTYPE_MOSTLY_IQ3_SXL,   " 3.90 bpw quantization mix", },
+    { "IQ3_UXL",   LLAMA_FTYPE_MOSTLY_IQ3_UXL,   " 4.05 bpw quantization mix", },
+    { "Q3_K",      LLAMA_FTYPE_MOSTLY_Q3_K_M,    "alias for Q3_K_M" },
+    { "IQ3_XS",    LLAMA_FTYPE_MOSTLY_IQ3_XS,    " 3.3 bpw quantization", },
+    { "Q3_K_S",    LLAMA_FTYPE_MOSTLY_Q3_K_S,    " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M",    LLAMA_FTYPE_MOSTLY_Q3_K_M,    " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L",    LLAMA_FTYPE_MOSTLY_Q3_K_L,    " 4.10 bpw quantization mix", },
+    { "Q3_K_XL",   LLAMA_FTYPE_MOSTLY_Q3_K_XL,   " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "IQ4_NL",    LLAMA_FTYPE_MOSTLY_IQ4_NL,    " 4.1x bpw non-linear quantization", },
+    { "IQ4_XS",    LLAMA_FTYPE_MOSTLY_IQ4_XS,    " 4.25 bpw non-linear quantization", },
+    { "IQ4_XXSR",  LLAMA_FTYPE_MOSTLY_IQ4_XXSR,  " 4.xx bpw non-linear quantization", },
+    { "IQ4_XSR",   LLAMA_FTYPE_MOSTLY_IQ4_XSR,   " 4.xx bpw non-linear quantization", },
+    { "IQ4_MR",    LLAMA_FTYPE_MOSTLY_IQ4_MR,    " 4.xx bpw non-linear quantization", },
+    { "IQ4_LR",    LLAMA_FTYPE_MOSTLY_IQ4_LR,    " 4.xx bpw non-linear quantization", },
+    { "Q4_K",      LLAMA_FTYPE_MOSTLY_Q4_K_M,    "alias for Q4_K_M", },
+    { "Q4_K_S",    LLAMA_FTYPE_MOSTLY_Q4_K_S,    " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M",    LLAMA_FTYPE_MOSTLY_Q4_K_M,    " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+    { "Q5_K",      LLAMA_FTYPE_MOSTLY_Q5_K_M,    "alias for Q5_K_M", },
+    { "Q5_K_S",    LLAMA_FTYPE_MOSTLY_Q5_K_S,    " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M",    LLAMA_FTYPE_MOSTLY_Q5_K_M,    " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q5_K_XS1R", LLAMA_FTYPE_MOSTLY_Q5_K_XS1R, " 5.4 bpw quantization mix", },
+    { "Q5_K_S2R",  LLAMA_FTYPE_MOSTLY_Q5_K_S2R,  " 5.6 bpw quantization mix", },
+    { "Q5_K_M3L",  LLAMA_FTYPE_MOSTLY_Q5_K_M3L,  " 5.8 bpw quantization mix", },
+    { "Q5_K_X4L",  LLAMA_FTYPE_MOSTLY_Q5_K_X4L,  " 6 bpw quantization mix", },
+    { "Q6_K",      LLAMA_FTYPE_MOSTLY_Q6_K,      " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0",      LLAMA_FTYPE_MOSTLY_Q8_0,      " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+
     { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
@@ -339,6 +386,54 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--attn-q-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.attn_q_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--attn-k-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.attn_k_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--attn-v-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.attn_v_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--attn-qkv-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.attn_qkv_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--attn-output-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.attn_output_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--ffn-gate-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.ffn_gate_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--ffn-down-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.ffn_down_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--ffn-up-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.ffn_up_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                usage(argv[0]);
@@ -482,14 +577,14 @@ int main(int argc, char ** argv) {
     }

     if (!params.ignore_imatrix_rules && imatrix_data.empty() &&
-        (params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+        (params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
-        fprintf(stderr, "\n==========================================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "==========================================================================================================\n\n\n");
+        fprintf(stderr, "\n==========================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_*, IQ2_*, Q2_K_S, or Q2_K quantization without an importance matrix!\n");
+        fprintf(stderr, "==========================================================================================\n\n\n");
         return 1;
     }
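A usage sketch for the flags added above (a hypothetical invocation: the argument layout is assumed from the upstream quantize example, and the type names are those accepted by parse_ggml_type, i.e. GGML type names such as q6_K or iq4_xs):

    # Hypothetical example: quantize to the IQ3_M mix, but force specific
    # GGML types for two tensor classes via the new per-tensor flags.
    ./llama-quantize --imatrix imatrix.dat \
        --attn-v-type q6_K \
        --ffn-down-type iq4_xs \
        model-F16.gguf model-IQ3_M.gguf IQ3_M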

ggml/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -103,7 +103,7 @@ if (WIN32)
 endif()

 # ggml core
-set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+set(GGML_SCHED_MAX_COPIES "1" CACHE STRING "ggml: max input copies for pipeline parallelism")

 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@@ -113,18 +113,19 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
 option(GGML_IQK_MUL_MAT "ggml: use optimized iqk matrix multiplications" ON)

-option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_CUDA "ggml: use CUDA" ON)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
-option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
+option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" ON)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
 set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
-option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
+option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" ON)
 set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
                             "ggml: iters./thread per block for Q2_K/Q6_K")
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                             "ggml: max. batch size for using peer access")
+set (GGML_SCHED_MAX_COPIES "1" CACHE STRING "llama: max input copies for pipeline parallelism")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
 option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
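These are CACHE defaults, so they can still be overridden at configure time; a sketch restoring the upstream values changed above:

    # Hypothetical configure line restoring the upstream defaults.
    cmake -B build -DGGML_CUDA=OFF -DGGML_CUDA_FORCE_MMQ=OFF \
          -DGGML_CUDA_F16=OFF -DGGML_SCHED_MAX_COPIES=4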

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ include(CheckCXXCompilerFlag)

 unset(GGML_CDEF_PUBLIC)

+add_compile_definitions(LLAMA_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

 # enable libstdc++ assertions for debug builds

ggml/src/ggml-backend.c

Lines changed: 1 addition & 1 deletion
@@ -1027,7 +1027,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
-#define GGML_SCHED_MAX_COPIES 4
+#define GGML_SCHED_MAX_COPIES 1
 #endif

 struct ggml_backend_sched_split {

ggml/src/ggml-cuda.cu

Lines changed: 0 additions & 9 deletions
@@ -2484,7 +2484,6 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
     }
-    memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
 }

 static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
@@ -2516,12 +2515,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
             return false;
         }
     }
-
-    if (node->op == GGML_OP_SCALE &&
-        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
-        return false;
-    }
-
     return true;
 }

@@ -2732,9 +2725,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // First call with null argument gets number of nodes in graph
             CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
             // Subsequent call with non-null argument gets nodes
-            cuda_ctx->cuda_graph->nodes.clear();
             cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
-            cuda_ctx->cuda_graph->params.clear();
             cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
             if (cuda_ctx->cuda_graph->num_nodes > 0) {
                 CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));

ggml/src/ggml-cuda/common.cuh

Lines changed: 0 additions & 1 deletion
@@ -652,7 +652,6 @@ struct ggml_graph_node_properties {
     int64_t ne[GGML_MAX_DIMS];
     size_t nb[GGML_MAX_DIMS];
     void * src_address[GGML_MAX_SRC];
-    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 };

 struct ggml_cuda_graph {

gguf-py/gguf/constants.py

Lines changed: 27 additions & 0 deletions
@@ -1298,6 +1298,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
     MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
     MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+
     MOSTLY_IQ1_BN = 36, # except 1d tensors
     MOSTLY_IQ2_BN = 37, # except 1d tensors
     MOSTLY_IQ2_K = 38, # except 1d tensors
@@ -1308,6 +1309,32 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ2_TN = 43, # except 1d tensors


+    # MOSTLY_TQ1_0 = 36 # except 1d tensors
+    # MOSTLY_TQ2_0 = 37 # except 1d tensors
+    MOSTLY_IQ2_XL = 100 # except 1d tensors
+    MOSTLY_Q2_K_L = 101 # except 1d tensors
+    MOSTLY_IQ1_XS = 102 # except 1d tensors
+    MOSTLY_IQ1_XL = 103 # except 1d tensors
+    MOSTLY_IQ3_S2L = 104 # except 1d tensors
+    MOSTLY_IQ3_M3L = 105 # except 1d tensors
+    MOSTLY_IQ3_X4L = 106 # except 1d tensors
+    MOSTLY_IQ3_X5L = 107 # except 1d tensors
+    MOSTLY_IQ3_X6L = 108 # except 1d tensors
+    MOSTLY_IQ3_X7L = 109 # except 1d tensors
+    MOSTLY_IQ3_EXL = 110 # except 1d tensors
+    MOSTLY_IQ3_SXL = 111 # except 1d tensors
+    MOSTLY_IQ3_UXL = 112 # except 1d tensors
+    MOSTLY_Q3_K_XL = 113 # except 1d tensors
+    MOSTLY_IQ4_XXSR = 114 # except 1d tensors
+    MOSTLY_IQ4_XSR = 115 # except 1d tensors
+    MOSTLY_IQ4_MR = 116 # except 1d tensors
+    MOSTLY_IQ4_LR = 117 # except 1d tensors
+    MOSTLY_Q5_K_XS1R = 118 # except 1d tensors
+    MOSTLY_Q5_K_S2R = 119 # except 1d tensors
+    MOSTLY_Q5_K_M3L = 120 # except 1d tensors
+    MOSTLY_Q5_K_X4L = 121 # except 1d tensors
+    MOSTLY_CQS = 199 # except 1d tensors
+
     GUESSED = 1024 # not specified in the model file
13131340

include/llama.h

Lines changed: 27 additions & 1 deletion
@@ -169,7 +169,7 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
-    //
+
     LLAMA_FTYPE_MOSTLY_Q6_0 = 135, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ1_BN = 136, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ2_BN = 137, // except 1d tensors
@@ -183,6 +183,32 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ2_KS = 147, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ4_KSS = 148, // except 1d tensors

+    // LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
+    // LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ2_XL = 100, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q2_K_L = 101, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ1_XS = 102, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ1_XL = 103, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_S2L = 104, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_M3L = 105, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_X4L = 106, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_X5L = 107, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_X6L = 108, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_X7L = 109, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_EXL = 110, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_SXL = 111, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ3_UXL = 112, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q3_K_XL = 113, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ4_XXSR = 114, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ4_XSR = 115, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ4_MR = 116, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ4_LR = 117, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q5_K_XS1R = 118, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q5_K_S2R = 119, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q5_K_M3L = 120, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q5_K_X4L = 121, // except 1d tensors
+    LLAMA_FTYPE_CQS = 199, // except 1d tensors
+
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };