Squashed commit of the following:
commit cf66ef8a27146afe575b64e135d117b212f7bd64
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 14:04:42 2024 +0100

    Updated tests.

commit eae157eaae352ce9b38c268cd647ff5ffa6bdf61
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 13:57:45 2024 +0100

    Updated changelog.

commit 6b2f8ebc1d421cd096ee9df7c5951fc373e54982
Merge: 8666f7e8 0c888e84
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 13:23:40 2024 +0100

    Merge branch 'master' into dev

commit 0c888e84367dfba1d1ab6d23a3aa663ad3c61440
Merge: 19c730d5 56f5d14
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 13:23:25 2024 +0100

    Merge branch 'master' of https://github.com/bbuchfink/diamond

commit 8666f7e853efe8c8a1d7d0bdde46ad810db7475c
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 12:34:15 2024 +0100

    Fixed output.

commit a70a755c04135b310d119e69ab9a899a6ace94b0
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 31 11:09:09 2024 +0100

    Changed max block size for vsens mode.

commit df9be960f04c8c41b49495746790e44784e7b138
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 16:56:36 2024 +0100

    Fixed taxon format.

commit 565bd4fd37d4116cb6cbd147eed851fd627c81af
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 16:45:54 2024 +0100

    Fixed error.

commit 67df3d4af021e7d39fa559064b3122dd825ab96b
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 16:43:01 2024 +0100

    Added taxon lineage option.

commit e4203e82775fb50789bfe5826b0dc50aa2fc8e67
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 15:57:39 2024 +0100

    Added gapped seqs to help.

commit 98b64461cae30e158fbd2e27eaf9d00d9fb53a59
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 15:47:50 2024 +0100

    Fixed warnings.

commit ab40c4a23dc16a11c9af50657ba40bf3ea734f8d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 15:41:20 2024 +0100

    Fixed warnings.

commit d9dc1ae43167fb8f496b97742e021b0300b24526
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 15:26:53 2024 +0100

    Fixed warnings.

commit ad6c31999575a2f0b3e84192a91a73fa0eab473e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 15:02:00 2024 +0100

    Updated version.

commit 30f509a8476c8b1ed4dd1497244da554301412ae
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 14:39:42 2024 +0100

    Added shapes-30x10 mode.

commit dc64958415be88c5ecfbb4b47e48ad5c34f78e4d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 14:29:17 2024 +0100

    Add per round cutoff options.

commit 4193abd0cf7da091bd25b794c278577075c0b9f8
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 13:43:06 2024 +0100

    Added setting ccd per round.

commit d78caa14b78cd8f690a02df08ef7951934486e41
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jan 30 12:59:40 2024 +0100

    Added shapes30x10 mode.

commit 30221fa8e1b1564fc28ab669a4f08a89da45b44b
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 29 16:16:44 2024 +0100

    Removed famsa, incremental clustering.

commit ba15651e286bfc4460343faeb5b08dd98bc97176
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 29 15:58:38 2024 +0100

    Update reassign for mutual cover.

commit cf5a84ec02058870e0c42020c66dcf423baf4383
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 29 15:30:01 2024 +0100

    Updated recluster for mutual coverage.

commit e6032ba102c805d565096b4cfc1e0c0d34ad9168
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jan 26 17:52:49 2024 +0100

    Update recluster for mutual cover.

commit bdee2616dfbe9195841bac59438d5cc44733a5d4
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jan 26 16:37:24 2024 +0100

    Fixed issue with length lookup.

commit 739be0ea92aee7d32837ab39c70efc2a792f37aa
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jan 26 11:20:14 2024 +0100

    Fixed issue with round values.

commit 34f0b3a73204117b41efc20883fa583e3a89339a
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Jan 25 13:28:44 2024 +0100

    Reworked storing target ids in hits logic.

commit 76e17c7443d4362bd821f855a2e8dc8598c67710
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 24 18:00:39 2024 +0100

    Rework target ids in seed array.

commit 31f0f34a3c06e52c9a87f565260375376601553e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 24 15:08:28 2024 +0100

    Fixed error.

commit 69680a20ad46fbc9618d5fca59cb7b76104946ca
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 24 13:20:32 2024 +0100

    Use ccd=0 in last round.

commit a428f0d9dfcd3225ec34bb358984ac1c6a2e4cd0
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 24 13:07:51 2024 +0100

    Added round approx id setting.

commit 261cf75522b9a703644329bfb24164d53d126e84
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jan 24 12:49:19 2024 +0100

    Fixed error.

commit 1f3f8a7acd480f4814f5a0f9eaee5b17630c7b11
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 22 17:00:42 2024 +0100

    Rework round coverage option

commit fb55d8eba76e8726b241d3f4fee81e73922af753
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 22 15:55:01 2024 +0100

    Apply evalue max in cascaded clustering.

commit ab498d0bbffe523b4e3087149cd7b6be5c29b096
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jan 22 15:25:31 2024 +0100

    Apply coverage increment.

commit f5dd1367549c7932185c78218b0eb6c4618e7e01
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jan 19 16:49:23 2024 +0100

    Fixed evalue calculation.

commit 87aff2d0eab54fc4440a5046f8d06f12440fec1b
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jan 12 12:26:15 2024 +0100

    Added shapes.

commit a49026b0319b04675ac3fa65c851bb116f792c43
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Jan 11 14:52:17 2024 +0100

    Added new weight 10 modes.

commit 590e1756fd198b288b4010414010d8b9baed089a
Author: Vincent Spath <90271444+v-spath@users.noreply.github.com>
Date:   Wed Nov 29 17:24:01 2023 +0100

    DNA only mapping (#18)

    * seed repetition cutoff

    * remove overhead

    * back to array

    * test

    * repetitive filter

    * changed float to double for parser

    * fixed count typo

    * removed io header

    * min heap seed filtering

    * fix heap

    * filter multithreaded

    * multithreaded optimized

    * locked guard

    * test new multithreading filter

    * filter multithreaded optimized

    * pass thread index during creation of thread

    * check smallest element before adding to queue

    * remove unordered set, replace by checking against cutoff value

    * refactor

    * new heuristics

    * add ungapped extension to set

    * new timer

    * temp

    * new timer

    * ssh

    * d

    * Delete Arabidopsis_thaliana.TAIR10.dna_sm.toplevel.fa.gz

    * new timer

    * timer

    * remove timer

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna_sm.toplevel.fa.gz

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna.chromosome.Pt.fa.gz

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz

    * refactor

    * seed filter

    * target filter

    * target filter

    * target filter

    * revert

    * r_ungapped to score

    * fix

    * wfadaptive

    * setheuristicnone

    * remove ungapped eval

    * repetition cutoff 0

    * fix filter for small index

    * backup chaining and apple specifics

    * revert

    * test ksw2 for mac

    * chaining and bug fixes and apple specifics

    * backtracking

    * backup chaining, with old out commented code, 1 bug left

    * working_chaining

    * add dna chaining condition

    * basic chaining

    * Delete src/align/align.cpp.save

    * repetition-cutoff into WITH_DNA

    * Delete libwfa2cpp.a

    * add fast approximation of log2

    * Delete setup.cpp.save

    * delete binary obj

    * remove key in RepetitiveCounter

    * removed vector of priority_queues as class member

    * Update dna_index.cpp

    check size repetitive before adding to queue

    * extended struct SeedMatch

    * fixed bug 100 % ungapped score

    * struct for data an chain

    * removed std output

    * update chain.cpp

    * change built queue

    * add sorting before chaining

    * chaining only for one target at a time

    * head

    * comments

    * include wfa2 in comment

    * include wfa comment

    * Update log2_fast.h

    * chain_end two parameters

    * fixed bug backtrack

    * make variables const if possible

    * parameters struct

    * struct anchor data

    * update documentation

    * backtrack simplification

    * update backtrack

    * delete wrong commit

    * map hsp

    * remove RepetitiveCounter struct

    * updated chain struct

    * construction Chain Object

    * chain Standard Komparator

    * use iterator

    * mapping

    * correction syntax

    * update indices

    * mapping quality

    * fixed index t_id

    * mapping output includes n_anchors (cm)

    * remove redundancy

    * fixed bug match build

    * primary computation for all targets

    * overlap percentage of shorter chain

    * fix compile error

    * dna extension

    * with dna compile

    * debug test

    * Update extension.cpp

    * fix segmentation fault

    * test why slow

    * test only map best

    * test map percentage

    * test 1

    * sort score

    * todo

    * move semantics

    * forward/reverse together

    * test seed lookup

    * Update extension.cpp

    * add chaining penalties

    * new primary chain computation

    * Update extension.cpp

    * Update extension.cpp

    * simplify chain structure

    * use PAF when only mapping

    * Update config.cpp

    * compile error on mac

    * Merge branch 'personal_dev' into dev

    # Conflicts:
    #	src/basic/config.cpp
    #	src/run/config.cpp
    #	src/run/double_indexed.cpp
    #	src/search/stage0.cpp

    * Todo: align long reads

    * basic chaining alignment

    * fix bug short reads reverse (now correct)

    * chain alignment

    * integrate ksw2 correctly

    * update WFA

    * corrected anchor alignment

    * fixed residue matches mapping

    * change semantics

    * refacator chain alignment

    * integrate WFA

    * correct primary computation

    * Update extension.h

    * chaining penalties scaling factors

    * correction chain alignment

    * compute correct alignment score

    * correct paf format DNA

    * clean up includes

    * filter chains

    * comments

    * wfa low memory mode

    * filter chains with lower bound

    ---------

    Co-authored-by: Dimi <dimitrios_K@gmx.de>
    Co-authored-by: Dimitrios Koutsogiannis <dkoutso@taco.eb.local>
    Co-authored-by: Dimi99 <73211787+Dimi99@users.noreply.github.com>
    Co-authored-by: vinceaps <90271444+vinceaps@users.noreply.github.com>
    Co-authored-by: Benjamin Buchfink <buchfink@gmail.com>

commit b8f12f41864bc2c2795bcf9ed22b867d717bc5da
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Nov 22 10:52:38 2023 +0100

    Add cmake flag for famsa.

commit 7fdecdb2879997b6b6a1079ed28ee2ecacf25229
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Nov 15 16:52:41 2023 +0100

    Added hit culling.

commit e3eadb911af4df39aa08bf0f560ec1068d4f5211
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Nov 9 17:02:54 2023 +0100

    Fixed non x86 compile.

commit b258c2f5be5421c5c80e44b77b4870d24ac6c603
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Nov 9 16:58:40 2023 +0100

    Fixed non x86 compile.

commit e055ce95c60e769da4f496232108b3968d0c4bf9
Author: Vincent Spath <90271444+v-spath@users.noreply.github.com>
Date:   Tue Nov 7 15:59:40 2023 +0100

    DNA chaining (#17)

    * seed repetition cutoff

    * remove overhead

    * back to array

    * test

    * repetitive filter

    * changed float to double for parser

    * fixed count typo

    * removed io header

    * min heap seed filtering

    * fix heap

    * filter multithreaded

    * multithreaded optimized

    * locked guard

    * test new multithreading filter

    * filter multithreaded optimized

    * pass thread index during creation of thread

    * check smallest element before adding to queue

    * remove unordered set, replace by checking against cutoff value

    * refactor

    * new heuristics

    * add ungapped extension to set

    * new timer

    * temp

    * new timer

    * ssh

    * d

    * Delete Arabidopsis_thaliana.TAIR10.dna_sm.toplevel.fa.gz

    * new timer

    * timer

    * remove timer

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna_sm.toplevel.fa.gz

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna.chromosome.Pt.fa.gz

    * temp

    * Delete Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz

    * refactor

    * seed filter

    * target filter

    * target filter

    * target filter

    * revert

    * r_ungapped to score

    * fix

    * wfadaptive

    * setheuristicnone

    * remove ungapped eval

    * repetition cutoff 0

    * fix filter for small index

    * backup chaining and apple specifics

    * revert

    * test ksw2 for mac

    * chaining and bug fixes and apple specifics

    * backtracking

    * backup chaining, with old out commented code, 1 bug left

    * working_chaining

    * add dna chaining condition

    * basic chaining

    * Delete src/align/align.cpp.save

    * repetition-cutoff into WITH_DNA

    * Delete libwfa2cpp.a

    * add fast approximation of log2

    * Delete setup.cpp.save

    * delete binary obj

    * remove key in RepetitiveCounter

    * removed vector of priority_queues as class member

    * Update dna_index.cpp

    check size repetitive before adding to queue

    * extended struct SeedMatch

    * fixed bug 100 % ungapped score

    * struct for data an chain

    * removed std output

    * update chain.cpp

    * change built queue

    * add sorting before chaining

    * chaining only for one target at a time

    * head

    * comments

    * include wfa2 in comment

    * include wfa comment

    * Update log2_fast.h

    * chain_end two parameters

    * fixed bug backtrack

    * make variables const if possible

    * parameters struct

    * struct anchor data

    * update documentation

    * backtrack simplification

    * update backtrack

    * delete wrong commit

    * remove RepetitiveCounter struct

    * updated chain struct

    * construction Chain Object

    * chain Standard Komparator

    * use iterator

    * correction syntax

    * fixed index t_id

    * move semantics

    ---------

    Co-authored-by: Dimi <dimitrios_K@gmx.de>
    Co-authored-by: Dimitrios Koutsogiannis <dkoutso@taco.eb.local>
    Co-authored-by: Dimi99 <73211787+Dimi99@users.noreply.github.com>
    Co-authored-by: vinceaps <90271444+vinceaps@users.noreply.github.com>
    Co-authored-by: Benjamin Buchfink <buchfink@gmail.com>

commit 19c730d59620efa02e9b7a6928283962767a64c3
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Nov 7 15:54:53 2023 +0100

    Add output field for lineage.

commit 7cb96085fa2950f0d500e468331d0f7a8367c182
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Nov 7 10:48:11 2023 +0100

    Check for identical block.

commit 8438e3387278dad3405ed9c2f7b1b6a29be94407
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Oct 26 14:23:56 2023 +0200

    remove alignment computation.

commit 52649efac7cc76c4ab7822c7848cd74763582c07
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Oct 25 17:35:38 2023 +0200

    Filter by mutual coverage.

commit 1bfa4e2a82e52fb91241c48eeb349a29f7dbed53
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Oct 25 17:08:11 2023 +0200

    Added alignment.

commit 1149045d6c55fcf40604248e57c8c93b3ddbd4ab
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Oct 25 12:55:15 2023 +0200

    Fix centroid lookup.

commit dcef57b958768f36c69dc6a1c19527082a043f9e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Oct 24 16:35:01 2023 +0200

    Added parallel processing.

commit c0ebcc97a3f0201ca5a271199795a144f97c49c5
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Oct 24 13:48:51 2023 +0200

    Fixed bug.

commit ff8b053116a7df875e59fafff7e933da10c2bddf
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Oct 24 13:43:18 2023 +0200

    Check for indirection.

commit 539416b50a933cde0594c9a110b4945d6c366d57
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Oct 24 12:33:32 2023 +0200

    Write output.

commit 0fdfe23a8695efeb8e824fb07e79a959e5ce317a
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Oct 23 16:59:07 2023 +0200

    Process alignments.

commit 83b0442e0efc20f28e703d16ebf52503015189d0
Merge: 8d0ce419 5171828f
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Sat Oct 21 14:39:44 2023 +0200

    Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev

commit 8d0ce419c8bf9c835403ca1afd1ee0b8be5d4b5e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Sat Oct 21 14:39:34 2023 +0200

    Fixed error.

commit 5171828faf97348653d5447f4765e8c041075c58
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Oct 20 14:50:12 2023 +0200

    Fixed error.

commit 367ac23fc68acbb4c195d6136f5d83d2e24a9770
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Oct 19 17:38:31 2023 +0200

    Added profile alignment.

commit 03b51a98073196fb03494979fa03ee917448b376
Merge: 13a3e512 6096ef28
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Oct 18 16:26:27 2023 +0200

    Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev

commit 13a3e512efd362d6746359fd17580fc4dfee6908
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Oct 18 16:26:08 2023 +0200

    Fixed blastdb.

commit 6096ef2841d6e0c87d58292b4238eb9ce3d6b17d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Oct 17 16:53:37 2023 +0200

    Added local alignment of profiles.

commit 8a5eca0d72348012775177f929447ddcfd5db0a7
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Oct 13 15:32:15 2023 +0200

    Fix leak.

commit d2563fd8c85c6891064ce2a8010a5c38e94c63b9
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Oct 13 15:04:16 2023 +0200

    Added profile-recluster workflow.

commit d4b638e4fee52ae4024d35e109e81bed1f9ee32d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Oct 12 17:03:33 2023 +0200

    Added MSA computation.

commit a7e375040af808cdea6ae2bfc9a43b2fb285b1db
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Oct 12 14:24:55 2023 +0200

    Added FAMSA.

commit 88119a03726d7288c406237d0b975c1ad304c426
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Sep 27 16:52:47 2023 +0200

    Use letter count of subdb in clustering.

commit bed1a484f7f28beb654d12817431c8a7610faca0
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Sep 27 16:28:07 2023 +0200

    Added length lookup.

commit 81843d6fa42081219f89d0833838b8d481dfec23
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Sep 26 13:54:08 2023 +0200

    Use coverage increment in earlier rounds.

commit 53a20c61e5f4e68b41505bc08dcb776070cbca35
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Sep 26 13:40:00 2023 +0200

    Use ccd=0 in linear stages.

commit e5699a10fb0b8da76f1152ac5773d3cafdb304a1
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Sep 18 16:52:45 2023 +0200

    Use median sequence.

commit da749bc1fb1826bebbeadd80ad8a46adc70dc118
Merge: eb336685 a1c35d03
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Sep 18 11:44:00 2023 +0200

    Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev

commit eb33668574a28a84ec585613c8d31de644d89ff6
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Sep 18 11:43:25 2023 +0200

    Fixed subject source range.

commit a1c35d03f4ba0f4a56461d431df6dbde277b85eb
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Sep 11 14:50:49 2023 +0200

    Auto set min length ratio.

commit 612df402ca9fafc2c5cfed2d54b62354041edb8f
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Sep 11 13:43:35 2023 +0200

    Added mutual cover clustering.

commit c909a44381fa52d6a359d44b607fedc59690eeff
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Sep 8 11:15:33 2023 +0200

    Added connected component depth.

commit b613c12c6d4f5abaa9914075626414f39392b7b0
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Sep 6 16:58:28 2023 +0200

    Added cc clustering.

commit abd593504534f23632b98787f5053d5d93d23dc4
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Sep 5 17:04:45 2023 +0200

    Added callback for bidirectional clustering.

commit 76b222058f50e2fee6664159e0df47cb518b667b
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Sep 5 15:46:27 2023 +0200

    Added --no-reassign to gvc.

commit 268090426a40c2b22f04b8a0ced5101b6195ba5d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Sep 5 10:31:51 2023 +0200

    Fixed approx_id for anchored swipe.

commit 4acb106d2495d0cd67481567607c81184081fbf9
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Aug 28 14:56:25 2023 +0200

    Fixed bug.

commit ccae0326a15e745e07c54edced47e3284b92c949
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Aug 28 13:26:12 2023 +0200

    Added length ratio filter.

commit 43eb577fd11eca0a84901b558013a409edcfd5c6
Merge: 49c3fffb 7d89b2b0
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Aug 25 16:04:42 2023 +0200

    Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev

commit 49c3fffb58f22405b1496b2f18fe8c04e35298b2
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Aug 25 16:03:43 2023 +0200

    Added len ratio filter.

commit 7d89b2b0197901a8727e2814ea35605e2cf5cdaf
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Jul 20 16:12:54 2023 +0200

    Fixed warning.

commit 6c4c4d7994f80f8603027cabf1d966ead048b428
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Jul 20 16:02:58 2023 +0200

    Added sorting.

commit 97c350241a5b309b3fc1862b3666365834cfbe39
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jul 19 11:23:49 2023 +0200

    Fixed full_qqual field.

commit 4ba1920cbe2d4ed3cb45d803126296a2bcc1a775
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jul 18 18:50:34 2023 +0200

    Add len-ratio filter.

commit 58bf56ec153b9f5944b672db9f136aec0e1af705
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Jul 18 16:19:29 2023 +0200

    Added symmetric option to gvc.

commit 4d8f6d6aee2d895d7a62ace47689432dc31e6de3
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Jul 17 16:43:32 2023 +0200

    Added mutual cov lin-stage1.

commit 6693d01a2004346cd35f209f448aa5803a347737
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jul 14 16:59:11 2023 +0200

    Added linclust stage with mutual cov.

commit dd3fb04acc23760257664ea6c4f6127dd576f8c6
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jul 14 12:41:03 2023 +0200

    Added mutual coverage.

commit 97c1e2e54dd5a540800b022a7e3bb513ae5da429
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jul 12 15:55:04 2023 +0200

    Fixed sam query field.

commit 2af1fa4028f30bbb9da4363d80f268ead311e3f5
Merge: e1d1c047 23a1ba7
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Jul 12 15:54:11 2023 +0200

    Merge branch 'master' into dev

commit e1d1c047d6459ced42b85ce897a5d42a7640714e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jun 23 15:11:53 2023 +0200

    Added length ratio filter.

commit 59a52cd0bcf1458bc311b36ba3911fd00bee1319
Merge: 561b9632 af4fc64
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Jun 23 14:24:04 2023 +0200

    Merge branch 'master' into dev

commit 561b96324e32731019b3e3ca7d3df5b54caa9217
Merge: 70409de0 749de49c
Author: Dimi99 <73211787+Dimi99@users.noreply.github.com>
Date:   Wed Jun 21 14:21:55 2023 +0200

    Merge pull request #12 from Dimi99/dev

    Add the WFA Extension-Option

commit 749de49c7d90b6fee3c331bd009fb86dfcce4557
Merge: 141f517e 52cdcbae
Author: Dimi <dimitrios_K@gmx.de>
Date:   Wed Jun 21 14:13:18 2023 +0200

    Merge branch 'dev' of github.com:Dimi99/diamond_dev into dev

commit 141f517ee52480501ac3a79b72be8ca05818cde0
Author: Dimi <dimitrios_K@gmx.de>
Date:   Wed Jun 21 14:13:09 2023 +0200

    changes

commit 52cdcbae9023e38c94235778a33abb4542425b6a
Merge: c6c9bfff 70409de0
Author: Dimi99 <73211787+Dimi99@users.noreply.github.com>
Date:   Wed Jun 21 12:35:12 2023 +0200

    Merge branch 'dev' into dev

commit c6c9bfff21d275992a55da542553120d6af0231a
Author: Dimi <dimitrios_K@gmx.de>
Date:   Wed Jun 21 12:31:25 2023 +0200

    cmake

commit 9a19d041c72f09aca50a41913b2adb8e4f10eae8
Author: Dimi <dimitrios_K@gmx.de>
Date:   Wed Jun 21 12:30:59 2023 +0200

    cmake

commit 5dd3ac5e6cd6db4062e2da9b726d155ba2f823e6
Author: Dimi <dimitrios_K@gmx.de>
Date:   Wed Jun 21 12:23:41 2023 +0200

    timer

bbuchfink committed Jan 31, 2024
1 parent 56f5d14 commit ba08828
Showing 200 changed files with 6,108 additions and 448,714 deletions.
31 changes: 14 additions & 17 deletions CMakeLists.txt
@@ -18,7 +18,6 @@ option(DP_STAT "DP_STAT" OFF)
option(SINGLE_THREADED "SINGLE_THREADED" OFF)
option(EIGEN_BLAS "EIGEN_BLAS" OFF)
option(WITH_ZSTD "WITH_ZSTD" OFF)
option(KEEP_TARGET_ID "KEEP_TARGET_ID" OFF)
option(HIT_KEEP_TARGET_ID "HIT_KEEP_TARGET_ID" OFF)
option(LONG_SEEDS "LONG_SEEDS" OFF)
option(WITH_AVX512 "WITH_AVX512" OFF)
@@ -76,10 +75,6 @@ if(EXTRA)
add_definitions(-DEXTRA)
endif()

if(KEEP_TARGET_ID)
add_definitions(-DKEEP_TARGET_ID)
endif()

if(HIT_KEEP_TARGET_ID)
add_definitions(-DHIT_KEEP_TARGET_ID)
endif()
@@ -100,7 +95,12 @@ if(WITH_MCL)
add_definitions(-DWITH_MCL)
endif()

if(WITH_FAMSA)
add_definitions(-DWITH_FAMSA)
endif()

add_definitions(-DMAX_SHAPE_LEN=${MAX_SHAPE_LEN})
add_definitions(-D_ITERATOR_DEBUG_LEVEL=0)

IF(STATIC_LIBGCC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libgcc")
@@ -146,13 +146,13 @@ endif()

if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
add_definitions(-D_HAS_STD_BYTE=0)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-implicit-fallthrough -Wreturn-type -Wno-unused -Wno-unused-parameter -Wno-unused-variable -Wno-uninitialized -Wno-deprecated-copy -Wno-unknown-warning-option")
# -fsanitize=address -fno-omit-frame-pointer
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-implicit-fallthrough -Wreturn-type -Wno-unused -Wno-unused-parameter -Wno-unused-variable -Wno-uninitialized -Wno-deprecated-copy -Wno-unknown-warning-option ")#-g -fsanitize=address -fno-omit-frame-pointer ")
endif()

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pragma-clang-attribute -Wno-overloaded-virtual -Wno-missing-braces")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pragma-clang-attribute -Wno-overloaded-virtual -Wno-missing-braces") #-g -fsanitize=thread -fno-omit-frame-pointer" )
endif()

if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5)
@@ -391,8 +391,6 @@ set(OBJECTS
src/util/tsv/merge.cpp
src/util/tsv/join.cpp
src/dp/scalar/smith_waterman.cpp
src/cluster/incremental/config.cpp
src/cluster/incremental/run.cpp
src/align/short.cpp
src/tools/tsv.cpp
)
@@ -402,10 +400,14 @@ if(WITH_DNA)
src/dna/smith_watermann.cpp
src/dna/dna_index.cpp
src/dna/seed_set_dna.cpp
src/dna/extension.cpp
src/dna/chain.cpp
src/dna/extension_chain.cpp
#src/dna/align.cpp
src/lib/ksw2/ksw2_extz2_sse.c
src/dna/ksw2_extension.cpp
src/lib/ksw2/ksw2_extz.c
src/lib/WFA2-lib.diamond/bindings/cpp/WFAligner.cpp
)
endif()

if(WITH_MCL)
@@ -482,15 +484,10 @@ if(BLAST_INCLUDE_DIR)
endif()
endif()



if(WITH_DNA)
include_directories(src/lib/WFA2-lib.diamond)
add_subdirectory(src/lib/WFA2-lib.diamond)

target_link_libraries(diamond wfa2)

target_compile_options(wfa2 PRIVATE -DCMAKE_BUILD_TYPE=Release -DEXTRA_FLAGS="-ftree-vectorize -msse2 -mfpmath=sse -ftree-vectorizer-verbose=5")
endif()

if(EIGEN_BLAS)
30 changes: 30 additions & 0 deletions src/ChangeLog
@@ -1,3 +1,33 @@
[2.1.9]
- Corrected the prefix of the query length field for the SAM format.
- Added the size modifiers 'T', 'M' and 'K' for the `--memory-limit`/`-M`
option.
- Added the option `--mutual-cover` to cluster sequences by mutual coverage
percentage of the cluster representative and member sequence.
- Added the option `--symmetric` for computing greedy vertex cover with
symmetric edges.
- Fixed an issue that caused the `--approx-id` option and the `approx_pident`
output field not to work correctly when using the `--anchored-swipe`
option.
- Added the option `--no-reassign` to prevent reassignment to closest
representative for the greedy vertex cover workflow.
- Added the option `--connected-component-depth` to activate clustering
of connected components at a given maximum depth for the greedy vertex
cover and the clustering workflows.
- Fixed a compiler error for Clang v17.
- Improved search performance when searching with mutual coverage threshold
by filtering for sequence length ratio.
- Added the sensitivity mode `--shapes-30x10` with sensitivity approximately
equivalent to `--mid-sensitive`.
- Added the options `--round-coverage` and `--round-approx-id` to set per
round cutoffs for cascaded clustering.
- The CMake switch `-DKEEP_TARGET_ID` is now obsolete and the corresponding
function is always available.
- Added the option `--include-lineage` to the taxonomic classification format
to include taxonomic lineage in the output.
- Added native support for the ARM NEON instruction set (contributed by
Martin Larralde).

[2.1.8]
- Fixed an issue that could cause reduced performance when running in
query-indexed mode.
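The [2.1.9] entry above adds the size modifiers 'T', 'M' and 'K' for `--memory-limit`/`-M`. A minimal sketch of such suffix parsing, assuming a hypothetical helper (this is not DIAMOND's actual parsing code; 'G' is included since the limit was previously given in gigabytes):

```cpp
#include <cctype>
#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical sketch of the 'T'/'M'/'K' size modifiers described in the
// changelog entry for --memory-limit/-M. Illustrative only.
int64_t parse_memory_limit(const std::string& value) {
    if (value.empty())
        throw std::invalid_argument("empty size");
    const char suffix = std::toupper(static_cast<unsigned char>(value.back()));
    int64_t unit;
    switch (suffix) {
    case 'K': unit = 1LL << 10; break;
    case 'M': unit = 1LL << 20; break;
    case 'G': unit = 1LL << 30; break;
    case 'T': unit = 1LL << 40; break;
    default: throw std::invalid_argument("expected a T/G/M/K suffix");
    }
    // Fractional values like "0.5G" are allowed in this sketch.
    return static_cast<int64_t>(std::stod(value.substr(0, value.size() - 1)) * unit);
}
```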
9 changes: 4 additions & 5 deletions src/align/align.cpp
@@ -32,8 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#endif
#include "extend.h"
#ifdef WITH_DNA
#include "../dna/wfa2_test.h"
#include "../dna/ksw2_extension.h"
#include "../dna/extension.h"
#endif
#include "../util/algo/radix_sort.h"
#include "target.h"
@@ -103,7 +102,7 @@ struct HitIterator {
r.push_back(Hits{ (BlockId)i,nullptr,nullptr });
return r;
}
if (i >= partition.size() - 1)
if (i >= int64_t(partition.size()) - 1)
return r;
const BlockId c = align_mode.query_contexts;
Search::Hit* begin = data + partition[i], * end = data + partition[i + 1];
@@ -120,7 +119,7 @@ struct HitIterator {
++it;
++last_query;
}
if (i == partition.size() - 2) {
if (i == (int64_t)partition.size() - 2) {
r.reserve(r.size() + query_end - (r.back().query + 1));
for (BlockId j = r.back().query + 1; j < query_end; ++j)
r.push_back(Hits{ j, nullptr,nullptr });
@@ -189,7 +188,7 @@ TextBuffer* legacy_pipeline(const HitIterator::Hits& hits, Search::Config& cfg,
TextBuffer *buf = nullptr;
if (!cfg.blocked_processing && *cfg.output_format != OutputFormat::daa && config.report_unaligned != 0) {
buf = new TextBuffer;
Output::Info info{ cfg.query->seq_info(hits.query), true, cfg.db.get(), *buf, {} };
Output::Info info{ cfg.query->seq_info(hits.query), true, cfg.db.get(), *buf, {}, AccessionParsing() };
cfg.output_format->print_query_intro(info);
cfg.output_format->print_query_epilog(info);
}
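The two hunks above cast `partition.size()` to a signed type before subtracting 1. The reason is that `size()` returns an unsigned `size_t`, so `partition.size() - 1` wraps to `SIZE_MAX` when the vector is empty and the bounds guard never fires. A self-contained sketch of the bug and the fix (the function names are illustrative, not DIAMOND code):

```cpp
#include <cstddef>
#include <cstdint>

// Guard as originally written: the comparison is done in unsigned
// arithmetic, so n - 1 underflows to SIZE_MAX when n == 0 and the
// guard is false for every i.
inline bool guard_unsigned(int64_t i, std::size_t n) {
    return static_cast<std::size_t>(i) >= n - 1;
}

// Guard as fixed in the hunks above: subtract in signed arithmetic,
// so for n == 0 the comparison is 0 >= -1 and the early return fires.
inline bool guard_signed(int64_t i, std::size_t n) {
    return i >= static_cast<int64_t>(n) - 1;
}
```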
1 change: 1 addition & 0 deletions src/align/extend.cpp
Original file line number Diff line number Diff line change
@@ -66,6 +66,7 @@ namespace Extension {
const std::map<Sensitivity, Mode> default_ext_mode = {
{ Sensitivity::FASTER, Mode::BANDED_FAST},
{ Sensitivity::FAST, Mode::BANDED_FAST},
{ Sensitivity::SHAPES30x10, Mode::BANDED_FAST},
{ Sensitivity::DEFAULT, Mode::BANDED_FAST},
{ Sensitivity::MID_SENSITIVE, Mode::BANDED_FAST},
{ Sensitivity::SENSITIVE, Mode::BANDED_FAST},
70 changes: 36 additions & 34 deletions src/align/global_ranking/table.cpp
@@ -31,6 +31,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#endif
#include "../load_hits.h"
#include "../dp/ungapped.h"
#include "../search/search.h"

using std::endl;
using std::thread;
@@ -45,44 +46,45 @@ namespace Extension { namespace GlobalRanking {
static void get_query_hits(SeedHits::Iterator begin, SeedHits::Iterator end, vector<Hit>& hits, Search::Config& cfg) {
hits.clear();
const SequenceSet& target_seqs = cfg.target->seqs();
#ifdef KEEP_TARGET_ID
auto get_target = [](const Search::Hit& hit) { return (BlockId)hit.subject_; };
auto it = merge_keys(begin, end, get_target);
while (it.good()) {
uint16_t score = 0;
for (SeedHits::Iterator i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score_);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score, 0);
++it;
if (Search::keep_target_id(cfg)) {
auto get_target = [](const Search::Hit& hit) { return (BlockId)hit.subject_; };
auto it = merge_keys(begin, end, get_target);
while (it.good()) {
uint16_t score = 0;
for (SeedHits::Iterator i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score_);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score, 0);
++it;
}
}
#else
else {
#ifdef BATCH_BINSEARCH
vector<Hit> hit1;
hit1.reserve(end - begin);
target_seqs.local_position_batch(begin, end, std::back_inserter(hit1), Search::Hit::CmpTargetOffset());
for (size_t i = 0; i < hit1.size(); ++i) {
hit1[i].score = begin[i].score_;
}
auto it = merge_keys(hit1.begin(), hit1.end(), Hit::Target());
while (it.good()) {
uint16_t score = 0;
for (auto i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score);
++it;
}
vector<Hit> hit1;
hit1.reserve(end - begin);
target_seqs.local_position_batch(begin, end, std::back_inserter(hit1), Search::Hit::CmpTargetOffset());
for (size_t i = 0; i < hit1.size(); ++i) {
hit1[i].score = begin[i].score_;
}
auto it = merge_keys(hit1.begin(), hit1.end(), Hit::Target());
while (it.good()) {
uint16_t score = 0;
for (auto i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score);
++it;
}
#else
auto get_target = [&target_seqs](const Search::Hit& hit) { return target_seqs.local_position((uint64_t)hit.subject_).first; };
auto it = merge_keys(begin, end, get_target);
while (it.good()) {
uint16_t score = 0;
for (SeedHits::Iterator i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score_);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score, 0);
++it;
}
#endif
auto get_target = [&target_seqs](const Search::Hit& hit) { return target_seqs.local_position((uint64_t)hit.subject_).first; };
auto it = merge_keys(begin, end, get_target);
while (it.good()) {
uint16_t score = 0;
for (SeedHits::Iterator i = it.begin(); i != it.end(); ++i)
score = std::max(score, i->score_);
hits.emplace_back((uint32_t)cfg.target->block_id2oid(it.key()), score, 0);
++it;
}
#endif
}
}

static pair<int, unsigned> target_score(const FlatArray<Extension::SeedHit>::DataIterator begin, const FlatArray<Extension::SeedHit>::DataIterator end, const Sequence* query_seq, const Sequence& target_seq) {
2 changes: 1 addition & 1 deletion src/align/legacy/query_mapper.cpp
@@ -223,7 +223,7 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Se
size_t seek_pos = 0;
const char *query_title = metadata.query->ids()[query_id];
unique_ptr<OutputFormat> f(cfg.output_format->clone());
Output::Info info{ cfg.query->seq_info(query_id), true, cfg.db.get(), buffer, {} };
Output::Info info{ cfg.query->seq_info(query_id), true, cfg.db.get(), buffer, {}, AccessionParsing() };

for (size_t i = 0; i < targets.size(); ++i) {

2 changes: 1 addition & 1 deletion src/align/output.cpp
@@ -38,7 +38,7 @@ TextBuffer* generate_output(vector<Match> &targets, const Extension::Stats& stat
std::unique_ptr<OutputFormat> f(cfg.output_format->clone());
size_t seek_pos = 0;
unsigned n_hsp = 0, hit_hsps = 0;
Output::Info info{ cfg.query->seq_info(query_block_id), targets.empty(), cfg.db.get(), *out, stats };
Output::Info info{ cfg.query->seq_info(query_block_id), targets.empty(), cfg.db.get(), *out, stats, AccessionParsing() };
TranslatedSequence query = query_seqs.translated_seq(align_mode.query_translated ? cfg.query->source_seqs()[query_block_id] : query_seqs[query_block_id], query_block_id * align_mode.query_contexts);
const char *query_title = cfg.query->ids()[query_block_id];
const double query_self_aln_score = cfg.query->has_self_aln() ? cfg.query->self_aln_score(query_block_id) : 0.0;
5 changes: 4 additions & 1 deletion src/align/ungapped.cpp
@@ -81,7 +81,7 @@ WorkTarget ungapped_stage(FlatArray<SeedHit>::DataIterator begin, FlatArray<Seed
return target;
}
std::sort(begin, end);
const bool with_diag_filter = config.hamming_ext || config.diag_filter_cov > 0 || config.diag_filter_id > 0;
const bool with_diag_filter = (config.hamming_ext || config.diag_filter_cov > 0 || config.diag_filter_id > 0) && !config.mutual_cover.present();
for (FlatArray<SeedHit>::DataIterator hit = begin; hit < end; ++hit) {
const auto f = hit->frame;
target.ungapped_score[f] = std::max(target.ungapped_score[f], hit->score);
@@ -139,6 +139,9 @@ vector<WorkTarget> ungapped_stage(const Sequence *query_seq, const Bias_correcti
}
else {
for (int64_t i = 0; i < n; ++i) {
/*const double len_ratio = query_seq->length_ratio(target_block.seqs()[target_block_ids[i]]);
if (len_ratio < config.min_length_ratio)
continue;*/
targets.push_back(ungapped_stage(seed_hits.begin(i), seed_hits.end(i), query_seq, query_cb, query_comp, &query_matrix, target_block_ids[i], stat, target_block, mode));
for (const ApproxHsp& hsp : targets.back().hsp[0]) {
Geo::assert_diag_bounds(hsp.d_max, query_seq[0].length(), targets.back().seq.length());
4 changes: 2 additions & 2 deletions src/basic/basic.cpp
@@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.8";
const char* Const::version_string = "2.1.9";
using std::string;
using std::vector;
using std::count;
@@ -51,7 +51,7 @@ AlignMode::AlignMode(unsigned mode) :
case blastn:
input_sequence_type = SequenceType::nucleotide;
query_translated = false;
query_contexts = 2;
query_contexts = 1;
query_len_factor = 1;
sequence_type = SequenceType::nucleotide;
break;
60 changes: 40 additions & 20 deletions src/basic/config.cpp
@@ -63,6 +63,7 @@ const EMap<Sensitivity> EnumTraits<Sensitivity>::to_string = {
{ Sensitivity::FAST, "fast" },
{ Sensitivity::DEFAULT, "default" },
{ Sensitivity::MID_SENSITIVE, "mid-sensitive" },
{ Sensitivity::SHAPES30x10, "shapes-30x10" },
{ Sensitivity::SENSITIVE, "sensitive" },
{ Sensitivity::MORE_SENSITIVE, "more-sensitive" },
{ Sensitivity::VERY_SENSITIVE, "very-sensitive" },
@@ -74,6 +75,7 @@ const SEMap<Sensitivity> EnumTraits<Sensitivity>::from_string = {
{ "fast", Sensitivity::FAST },
{ "default", Sensitivity::DEFAULT },
{ "mid-sensitive", Sensitivity::MID_SENSITIVE },
{ "shapes-30x10", Sensitivity::SHAPES30x10 },
{ "sensitive", Sensitivity::SENSITIVE },
{ "more-sensitive", Sensitivity::MORE_SENSITIVE },
{ "very-sensitive", Sensitivity::VERY_SENSITIVE },
@@ -91,6 +93,7 @@ const SEMap<Config::Algo> EnumTraits<Config::Algo>::from_string = { {"", Config:
const EMap<SequenceType> EnumTraits<SequenceType>::to_string = { {SequenceType::amino_acid,"prot"}, { SequenceType::nucleotide,"nucl"} };
const SEMap<SequenceType> EnumTraits<SequenceType>::from_string = { {"prot",SequenceType::amino_acid}, {"nucl",SequenceType::nucleotide} };


Config config;

pair<double, int> block_size(int64_t memory_limit, Sensitivity s, bool lin) {
@@ -99,7 +102,7 @@ pair<double, int> block_size(int64_t memory_limit, Sensitivity s, bool lin) {
c = m < 40.0 && s <= Sensitivity::MORE_SENSITIVE && min == 1 ? 4 : 1;
const double min_factor = std::min(1 / (double)min * 2, 1.0);
const double max = s <= Sensitivity::DEFAULT ? 12.0 :
(s <= Sensitivity::MORE_SENSITIVE ? 4.0 : 0.4);
(s <= Sensitivity::MORE_SENSITIVE ? 6.0 : 2.0);
double b = m / (18.0 * min_factor / c + 2.0);
/*if (b > 4)
b = floor(b);
@@ -109,8 +112,6 @@ pair<double, int> block_size(int64_t memory_limit, Sensitivity s, bool lin) {
b = floor(b * 1000) / 1000;*/
if (!config.no_block_size_limit && !lin)
b = std::min(b, max);
if (s >= Sensitivity::VERY_SENSITIVE)
b = std::min(b, 2.1);
//if (s >= Sensitivity::ULTRA_SENSITIVE)
//b = std::min(b, 0.6);
return { std::max(b, 0.001), c };
@@ -263,6 +264,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("faster", 0, "enable faster mode", mode_faster)
("fast", 0, "enable fast mode", mode_fast)
("mid-sensitive", 0, "enable mid-sensitive mode", mode_mid_sensitive)
("shapes-30x10", 0, "enable mode using 30 seed shapes of weight 10", mode_shapes30x10)
("sensitive", 0, "enable sensitive mode", mode_sensitive)
("more-sensitive", 0, "enable more sensitive mode", mode_more_sensitive)
("very-sensitive", 0, "enable very sensitive mode", mode_very_sensitive)
@@ -328,10 +330,12 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
\tsstart means Start of alignment in subject\n\
\tsend means End of alignment in subject\n\
\tqseq means Aligned part of query sequence\n\
\tqseq_gapped means Aligned part of query sequence (with gaps)\n\
\tqseq_translated means Aligned part of query sequence (translated)\n\
\tfull_qseq means Query sequence\n\
\tfull_qseq_mate means Query sequence of the mate\n\
\tsseq means Aligned part of subject sequence\n\
\tsseq_gapped means Aligned part of subject sequence (with gaps)\n\
\tfull_sseq means Subject sequence\n\
\tevalue means Expect value\n\
\tbitscore means Bit score\n\
@@ -362,33 +366,41 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
\tqqual means Query quality values for the aligned part of the query\n\
\tfull_qqual means Query quality values\n\
\tqstrand means Query strand\n\
\n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format);
\n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format)
("include-lineage", 0, "Include lineage in the taxonomic classification format", include_lineage)
;

auto& cluster_opt = parser.add_group("Clustering options", { cluster, RECLUSTER, DEEPCLUST, LINCLUST });
kmer_ranking = false;
cluster_opt.add()
("cluster-steps", 0, "Clustering steps", cluster_steps)
#ifdef KEEP_TARGET_ID
("kmer-ranking", 0, "Rank sequences based on kmer frequency in linear stage", kmer_ranking)
#endif
;
("round-coverage", 0, "Per-round coverage cutoffs for cascaded clustering", round_coverage)
("round-approx-id", 0, "Per-round approx-id cutoffs for cascaded clustering", round_approx_id);

auto& cluster_reassign_opt = parser.add_group("Clustering/reassign options", { cluster, RECLUSTER, CLUSTER_REASSIGN, GREEDY_VERTEX_COVER, DEEPCLUST, LINCLUST });
cluster_reassign_opt.add()
("memory-limit", 'M', "Memory limit in GB (default = 16G)", memory_limit)
("member-cover", 0, "Minimum coverage% of the cluster member sequence (default=80.0)", member_cover, 80.0);
("member-cover", 0, "Minimum coverage% of the cluster member sequence (default=80.0)", member_cover)
("mutual-cover", 0, "Minimum mutual coverage% of the cluster member and representative sequence", mutual_cover);

auto& gvc_opt = parser.add_group("GVC options", { GREEDY_VERTEX_COVER });
gvc_opt.add()
("centroid-out", 0, "Output file for centroids", centroid_out)
("edges", 0, "Input file for greedy vertex cover", edges)
("edge-format", 0, "Edge format for greedy vertex cover (default/triplet)", edge_format);
("edge-format", 0, "Edge format for greedy vertex cover (default/triplet)", edge_format)
("symmetric", 0, "Edges are symmetric", symmetric)
("no-reassign", 0, "Do not reassign to closest representative", no_gvc_reassign)
("connected-component-depth", 0, "Depth to cluster connected components", connected_component_depth);

auto& realign_opt = parser.add_group("Cluster input options", { CLUSTER_REALIGN, RECLUSTER, CLUSTER_REASSIGN });
realign_opt.add()
("clusters", 0, "Clustering input file mapping sequences to representatives", clustering);

string algo_str;
#ifdef WITH_DNA
string dna_extension_string;
#endif

auto& advanced_gen = parser.add_group("Advanced/general", { blastp, blastx, blastn, CLUSTER_REASSIGN, regression_test, cluster, DEEPCLUST, LINCLUST, makedb });
advanced_gen.add()
@@ -443,8 +455,19 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("heartbeat", 0, "", heartbeat)
("mp-self", 0, "", mp_self)
#ifdef EXTRA
("zdrop", 'z', "zdrop for gapped dna alignment", zdrop, 40)
#endif
#ifdef WITH_DNA
("repetition-cutoff", 0, "filter out top FLOAT fraction of repetitive minimizers", repetitive_cutoff, 0.0002)
("extension", 0, "extension algorithm (wfa, ksw=default)", dna_extension_string, string("ksw"))
("chaining-out", 0, "use chaining without extension", chaining_out)
("align-long-reads", 0, "use chaining with extension", align_long_reads)
("chain-pen-gap-scale", 0, "scaling factor for the chaining gap penalty", chain_pen_gap_scale, 0.8)
("chain-pen-skip-scale", 0, "scaling factor for the chaining skip penalty", chain_pen_skip_scale, 0.0)
("penalty", 0, "blastn mismatch penalty", mismatch_penalty, -3)
("reward", 0, "blastn match reward", match_reward, 2)
#endif

("query-or-subject-cover", 0, "", query_or_target_cover);

auto& view_align_options = parser.add_group("View/Align options", { view, blastp, blastx });
@@ -622,8 +645,6 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("diag-filter-cov", 0, "", diag_filter_cov)
("strict-gvc", 0, "", strict_gvc)
("dbtype", 0, "type of sequences in database file (nucl/prot)", dbstring, string("prot"))
("penalty", 0, "blastn mismatch penalty", mismatch_penalty, -3)
("reward", 0, "blastn match reward", match_reward, 2)
("cluster-similarity", 0, "Clustering similarity measure (default=\"normalized_bitscore_global\")", cluster_similarity)
("cluster-threshold", 0, "Threshold for the similarity measure (default=50%)", cluster_threshold)
("cluster-graph-file", 0, "Filename for dumping the graph or reading the graph if cluster-restart", cluster_graph_file)
@@ -646,14 +667,12 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("no_8bit_extension", 0, "", no_8bit_extension)
("anchored-swipe", 0, "", anchored_swipe)
("no_chaining_merge_hsps", 0, "", no_chaining_merge_hsps)
("recluster_bd", 0, "", recluster_bd)
("pipeline-short", 0, "", pipeline_short)
("graph-algo", 0, "", graph_algo, string("gvc"))
("tsv-read-size", 0, "", tsv_read_size, int64_t(GIGABYTES))
#ifndef KEEP_TARGET_ID
("kmer-ranking", 0, "Rank sequences based on kmer frequency in linear stage", kmer_ranking)
#endif
;
("min-len-ratio", 0, "", min_length_ratio)
("max-indirection", 0, "", max_indirection)
("aln-out", 0, "", aln_out);

parser.store(argc, argv, command);

@@ -870,9 +889,13 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
if (mode_more_sensitive) set_sens(Sensitivity::MORE_SENSITIVE);
if (mode_very_sensitive) set_sens(Sensitivity::VERY_SENSITIVE);
if (mode_ultra_sensitive) set_sens(Sensitivity::ULTRA_SENSITIVE);
if (mode_shapes30x10) set_sens(Sensitivity::SHAPES30x10);

algo = from_string<Algo>(algo_str);
dbtype = from_string<SequenceType>(dbstring);
#ifdef WITH_DNA
dna_extension = from_string<DNAExtensionAlgo>(dna_extension_string);
#endif
Translator::init(query_gencode);

if (command == blastx || command == blastn)
@@ -934,9 +957,6 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
#endif
#ifdef STRICT_BAND
log_stream << " STRICT_BAND";
#endif
#ifdef KEEP_TARGET_ID
log_stream << " KEEP_TARGET_ID";
#endif
log_stream << endl;
}
34 changes: 29 additions & 5 deletions src/basic/config.h
@@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/options/option.h"
#include "value.h"

enum class Sensitivity { FASTER = -1, FAST = 0, DEFAULT = 1, MID_SENSITIVE = 2, SENSITIVE = 3, MORE_SENSITIVE = 4, VERY_SENSITIVE = 5, ULTRA_SENSITIVE = 6};
enum class Sensitivity { FASTER = -1, FAST = 0, DEFAULT = 1, MID_SENSITIVE = 2, SHAPES30x10 = 3, SENSITIVE = 4, MORE_SENSITIVE = 5, VERY_SENSITIVE = 6, ULTRA_SENSITIVE = 7};
enum class Compressor;

template<> struct EnumTraits<Sensitivity> {
@@ -42,6 +42,12 @@ template<> struct EnumTraits<SequenceType> {
static const SEMap<SequenceType> from_string;
};

enum class DNAExtensionAlgo{KSW,WFA};
template<> struct EnumTraits<DNAExtensionAlgo>{
static const EMap<DNAExtensionAlgo> to_string;
static const SEMap<DNAExtensionAlgo> from_string;
};

enum class GraphAlgo { GREEDY_VERTEX_COVER, LEN_SORTED };

template<> struct EnumTraits<GraphAlgo> {
@@ -309,7 +315,7 @@ struct Config
string unaligned_targets;
Option<double> approx_min_id;
bool mode_faster;
double member_cover;
Option<double> member_cover;
bool weighted_gvc;
bool kmer_ranking;
bool hamming_ext;
@@ -331,15 +337,33 @@ struct Config
bool no_8bit_extension;
bool anchored_swipe;
bool no_chaining_merge_hsps;
bool recluster_bd;
bool pipeline_short;
string graph_algo;
bool linsearch;
int64_t tsv_read_size;
int zdrop;
bool heartbeat;
bool no_parse_seqids;
bool sam_qlen_field;
#ifdef WITH_DNA
DNAExtensionAlgo dna_extension;
double repetitive_cutoff;
bool chaining_out;
bool align_long_reads;
double chain_pen_gap_scale;
double chain_pen_skip_scale;
#endif
double min_length_ratio;
Option<double> mutual_cover;
bool symmetric;
bool no_gvc_reassign;
string_vector connected_component_depth;
std::vector<string> round_coverage;
string_vector round_approx_id;
int max_indirection;
bool mode_shapes30x10;
string aln_out;
bool include_lineage;

SequenceType dbtype;

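The `Sensitivity` change above is more than cosmetic: thresholds elsewhere in the code (e.g. `s <= Sensitivity::MORE_SENSITIVE` in `block_size()`) compare ordinal values, so inserting `SHAPES30x10 = 3` shifts every more-sensitive mode up by one. A self-contained sketch of that ordering property:

```cpp
// Scoped enums compare by their underlying numeric values, which is what
// sensitivity-threshold checks in the codebase rely on. Reproducing the
// new ordering from config.h:
enum class Sensitivity { FASTER = -1, FAST = 0, DEFAULT = 1, MID_SENSITIVE = 2,
    SHAPES30x10 = 3, SENSITIVE = 4, MORE_SENSITIVE = 5, VERY_SENSITIVE = 6,
    ULTRA_SENSITIVE = 7 };
```

Any serialized or hard-coded ordinal from the old numbering would now name a different mode, which is why the enum and its string maps were updated together.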
2 changes: 1 addition & 1 deletion src/basic/const.h
@@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 162,
build_version = 163,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
4 changes: 2 additions & 2 deletions src/basic/hssp.cpp
@@ -55,6 +55,7 @@ HspContext& HspContext::parse(const OutputFormat* output_format)
TranslatedPosition(hsp_.query_range.begin_, Frame(hsp_.frame)),
TranslatedPosition(hsp_.query_range.end_, Frame(hsp_.frame)),
(int)query.source().length());
hsp_.subject_source_range = hsp_.subject_range;
if (subject_seq.length() > 0)
hsp_.approx_id = hsp_.approx_id_percent(this->query.index(hsp_.frame), subject_seq);
return *this;
@@ -95,6 +96,7 @@ HspContext& HspContext::parse(const OutputFormat* output_format)

hsp_.query_range.end_ = i.query_pos.translated;
hsp_.subject_range.end_ = i.subject_pos;
hsp_.subject_source_range = hsp_.subject_range;
hsp_.query_source_range = TranslatedPosition::absolute_interval(begin().query_pos, i.query_pos, (int)query.source().length());
if (subject_seq.length() > 0)
hsp_.approx_id = hsp_.approx_id_percent(this->query.index(hsp_.frame), subject_seq);
@@ -290,8 +292,6 @@ Hsp::Hsp(const IntermediateRecord &r, unsigned query_source_len, Loc qlen, Loc t
backtraced(!IntermediateRecord::stats_mode(output_format->hsp_values) && output_format->hsp_values != HspValues::NONE),
score(r.score),
evalue(r.evalue),
bit_score(score_matrix.bitscore(r.score)),
corrected_bit_score(score_matrix.bitscore_corrected(r.score, qlen, tlen)),
transcript(r.transcript)
{
if(dna_score_builder == nullptr){
29 changes: 28 additions & 1 deletion src/basic/match.h
@@ -71,6 +71,10 @@ struct Hsp
bit_score(0.0),
corrected_bit_score(0.0),
approx_id(0.0),
#ifdef WITH_DNA
mapping_quality(0),
n_anchors(0),
#endif
matrix(nullptr)
{}

@@ -91,6 +95,10 @@ struct Hsp
bit_score(0.0),
corrected_bit_score(0.0),
approx_id(0.0),
#ifdef WITH_DNA
mapping_quality(0),
n_anchors(0),
#endif
matrix(nullptr)
{}

@@ -252,7 +260,10 @@ struct Hsp
std::pair<int, int> diagonal_bounds() const;
bool backtraced;
int score, frame, length, identities, mismatches, positives, gap_openings, gaps, swipe_target, d_begin, d_end, reserved1, reserved2;
Interval query_source_range, query_range, subject_range;
#ifdef WITH_DNA
int mapping_quality, n_anchors;
#endif
Interval query_source_range, query_range, subject_source_range, subject_range;
double evalue, bit_score, corrected_bit_score, approx_id;
Sequence target_seq;
const Stats::TargetMatrix* matrix;
@@ -392,11 +403,19 @@ struct HspContext
double approx_id_percent() const {
return hsp_.approx_id_percent(query.index(hsp_.frame), subject_seq);
}
#ifdef WITH_DNA
unsigned mapping_quality() const
{ return hsp_.mapping_quality; }
unsigned n_anchors() const
{ return hsp_.n_anchors; }
#endif
double qcovhsp() const;
double scovhsp() const;
double id_percent() const;
const Interval& query_source_range() const
{ return hsp_.query_source_range; }
const Interval& subject_source_range() const
{ return hsp_.subject_source_range; }
const Interval& query_range() const
{ return hsp_.query_range; }
const Interval& subject_range() const
@@ -464,6 +483,10 @@ struct TypeSerializer<HspContext> {
buf_->write(h.evalue());
buf_->write(h.score());
buf_->write(h.approx_id());
#ifdef WITH_DNA
buf_->write(h.mapping_quality());
buf_->write(h.n_anchors());
#endif
return *this;
}

@@ -503,6 +526,10 @@ struct TypeDeserializer<HspContext> {
file_->read(h.hsp_.evalue);
file_->read(h.hsp_.score);
file_->read(h.hsp_.approx_id);
#ifdef WITH_DNA
file_->read(h.hsp_.mapping_quality);
file_->read(h.hsp_.n_anchors);
#endif
h.hsp_.query_source_range = h.hsp_.query_range;
return h;
}
1 change: 1 addition & 0 deletions src/basic/seed_iterator.h
@@ -185,6 +185,7 @@ struct HashedSeedIterator
return *this;
}
++ptr_;
return *this;
}
Letter* seq_ptr(const Shape& sh) const {
return ptr_ - sh.length_;
3 changes: 3 additions & 0 deletions src/basic/sequence.h
@@ -217,6 +217,9 @@ struct Sequence
double masked_letter_ratio() const {
return (double)masked_letters() / len_;
}
double length_ratio(const Sequence& seq) const {
return len_ < seq.len_ ? (double)len_ / seq.len_ : (double)seq.len_ / len_;
}
static std::vector<Letter> from_string(const char* str, const ValueTraits&vt = value_traits);

Loc len_;
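The `length_ratio` helper added in this hunk is deliberately symmetric: it always divides the shorter length by the longer one, so the result lies in (0, 1] regardless of argument order. A standalone sketch of the same logic on raw lengths:

```cpp
// Symmetric length ratio, matching Sequence::length_ratio() in the diff:
// shorter / longer, so callers need not care which sequence is which.
static double length_ratio(int a, int b) {
    return a < b ? (double)a / b : (double)b / a;
}
```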
4 changes: 3 additions & 1 deletion src/basic/translated_position.h
@@ -120,7 +120,9 @@ struct TranslatedPosition

int absolute(int dna_len) const
{
if (!align_mode.query_translated)
if(!frame.offset && align_mode.mode == AlignMode::blastn)
return dna_len - 1 - translated;
if (!align_mode.query_translated && frame.strand == FORWARD)
return translated;
return oriented_position(in_strand(), frame.strand, dna_len);
}
40 changes: 32 additions & 8 deletions src/cluster/cascaded/cascaded.cpp
@@ -29,6 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../../run/workflow.h"

const char* const DEFAULT_MEMORY_LIMIT = "16G";
const double CASCADED_ROUND_MAX_EVALUE = 0.001;

using std::vector;
using std::shared_ptr;
@@ -56,23 +57,34 @@ static BitVector rep_bitset(const vector<SuperBlockId> &centroid, const BitVecto
return r;
}

vector<SuperBlockId> cluster(shared_ptr<SequenceFile>& db, const shared_ptr<BitVector>& filter, const SuperBlockId* member_counts, bool last_round) {
vector<SuperBlockId> cluster(shared_ptr<SequenceFile>& db, const shared_ptr<BitVector>& filter, const SuperBlockId* member_counts, int round, int round_count) {
using Edge = Util::Algo::Edge<SuperBlockId>;
statistics.reset();
const bool mutual_cover = config.mutual_cover.present();
config.command = Config::blastp;
config.output_format = { "edge" };
config.query_cover = 0;
config.subject_cover = 0;
config.query_or_target_cover = config.member_cover;
if (config.mutual_cover.present()) {
const vector<string> round_coverage = config.round_coverage.empty() ? default_round_cov(round_count) : config.round_coverage;
config.query_cover = config.subject_cover = std::max(config.mutual_cover.get_present(), round_value(round_coverage, "--round-coverage", round, round_count));
}
else {
config.query_cover = 0;
config.subject_cover = 0;
config.query_or_target_cover = config.member_cover;
}
config.algo = Config::Algo::DOUBLE_INDEXED;
config.max_target_seqs_ = INT64_MAX;
config.self = true;
config.iterate.unset();
config.mapany = false;
config.linsearch = false;
//if (config.db_size == 0 && filter) {
if(filter) {
config.db_size = db->letters_filtered(*filter);
}
tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), config.sensitivity, config.lin_stage1);

shared_ptr<Callback> callback(new Callback);
shared_ptr<Callback> callback(mutual_cover ? (Callback*)new CallbackBidirectional : (Callback*)new CallbackUnidirectional);

Search::run(db, nullptr, callback, filter);

@@ -83,14 +95,18 @@ vector<SuperBlockId> cluster(shared_ptr<SequenceFile>& db, const shared_ptr<BitV
InputFile f(callback->edge_file);
f.read(edges.data(), callback->count);
f.close_and_delete();
if (!config.aln_out.empty())
output_edges(config.aln_out, *db, edges);
timer.go("Sorting edges");
db->reopen();
FlatArray<Edge> edge_array = make_flat_array_dense(move(edges), (SuperBlockId)db->sequence_count(), config.threads_, Edge::GetKey());
timer.finish();

const auto algo = from_string<GraphAlgo>(config.graph_algo);
const bool last_round = round == round_count - 1;
const int ccd = round_ccd(round, round_count);
return algo == GraphAlgo::GREEDY_VERTEX_COVER ?
Util::Algo::greedy_vertex_cover(edge_array, config.weighted_gvc ? member_counts : nullptr, last_round && !config.strict_gvc)
Util::Algo::greedy_vertex_cover(edge_array, config.weighted_gvc ? member_counts : nullptr, last_round && !config.strict_gvc && !mutual_cover, !config.no_gvc_reassign, ccd)
: len_sorted_clust(edge_array);
}

@@ -107,6 +123,8 @@ vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db, bool linear) {
if (db->sequence_count() > (int64_t)numeric_limits<SuperBlockId>::max())
throw runtime_error("Workflow supports a maximum of " + to_string(numeric_limits<SuperBlockId>::max()) + " input sequences.");
const auto steps = cluster_steps(config.approx_min_id, linear);
const double evalue_cutoff = config.max_evalue,
target_approx_id = config.approx_min_id;
shared_ptr<BitVector> oid_filter(new BitVector);
int64_t cluster_count = db->sequence_count();
vector<SuperBlockId> centroids(cluster_count);
@@ -116,12 +134,18 @@ vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db, bool linear) {
TaskTimer timer;
config.lin_stage1 = ends_with(steps[i], "_lin");
config.sensitivity = from_string<Sensitivity>(rstrip(steps[i], "_lin"));
const vector<string> round_approx_id = config.round_approx_id.empty() ? default_round_approx_id(steps.size()) : config.round_approx_id;
config.approx_min_id = std::max(target_approx_id, round_value(round_approx_id, "--round-approx-id", i, steps.size()));
config.max_evalue = (size_t)i == steps.size() - 1 ? evalue_cutoff : std::min(evalue_cutoff, CASCADED_ROUND_MAX_EVALUE);
tie(centroids, *oid_filter) = update_clustering(*oid_filter,
centroids,
cluster(db, i == 0 ? nullptr : oid_filter, config.weighted_gvc ? member_counts(centroids).data() : nullptr, i == (int)steps.size() - 1),
cluster(db, i == 0 ? nullptr : oid_filter, config.weighted_gvc ? member_counts(centroids).data() : nullptr, i, (int)steps.size()),
i);
const int64_t n = oid_filter->one_count();
message_stream << "Clustering round " << i + 1 << " complete. #Input sequences: " << cluster_count << " #Clusters: " << n << " Time: " << timer.seconds() << 's' << endl;
message_stream << "Clustering round " << i + 1 << " complete. #Input sequences: " << cluster_count
<< " #Clusters: " << n
<< " #Letters: " << db->letters_filtered(*oid_filter)
<< " Time: " << timer.seconds() << 's' << endl;
cluster_count = n;
}
return centroids;
27 changes: 25 additions & 2 deletions src/cluster/cascaded/cascaded.h
@@ -35,12 +35,21 @@ struct Cascaded : public ClusteringAlgorithm {

std::vector<SuperBlockId> cascaded(std::shared_ptr<SequenceFile>& db, bool linear);
std::vector<std::string> cluster_steps(double approx_id, bool linear);
std::vector<std::string> default_round_approx_id(int steps);
std::vector<std::string> default_round_cov(int steps);
int round_ccd(int round, int round_count);

struct Callback : public Consumer {
using Edge = Util::Algo::Edge<SuperBlockId>;
Callback() :
count(0)
{}
virtual void consume(const char* ptr, size_t n) override = 0;
TempFile edge_file;
int64_t count;
};

struct CallbackUnidirectional : public Callback {
virtual void consume(const char* ptr, size_t n) override {
const char* end = ptr + n;
while (ptr < end) {
@@ -56,8 +65,22 @@ struct Callback : public Consumer {
}
}
}
TempFile edge_file;
int64_t count;
};

struct CallbackBidirectional : public Callback {
virtual void consume(const char* ptr, size_t n) override {
const char* end = ptr + n;
while (ptr < end) {
const auto edge = *(Output::Format::Edge::Data*)ptr;
ptr += sizeof(Output::Format::Edge::Data);
if (edge.query != edge.target) {
edge_file.write(Edge((SuperBlockId)edge.target, (SuperBlockId)edge.query, edge.evalue));
edge_file.write(Edge((SuperBlockId)edge.query, (SuperBlockId)edge.target, edge.evalue));
count += 2;
}
}
}
};

}

43 changes: 43 additions & 0 deletions src/cluster/cascaded/helpers.cpp
@@ -18,8 +18,10 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include <sstream>
#include "cascaded.h"

using std::stringstream;
using std::string;
using std::vector;

@@ -39,4 +41,45 @@ vector<string> cluster_steps(double approx_id, bool linear) {
return v;
}

vector<string> default_round_approx_id(int steps) {
switch (steps) {
case 1:
return {};
case 2:
return { "27.0" };
default:
return { "27.0", "0.0" };
}
}

vector<string> default_round_cov(int steps) {
switch (steps) {
case 1:
return {};
case 2:
return { "85.0" };
default:
return { "87.0", "85.0" };
}
}

static int round_ccd(int round) {
stringstream ss(config.connected_component_depth[round]);
int i;
ss >> i;
if (ss.fail() || !ss.eof())
throw std::runtime_error("Invalid number format for --connected-component-depth");
return i;
}

int round_ccd(int round, int round_count) {
if (config.connected_component_depth.size() > 1 && config.connected_component_depth.size() != (size_t)round_count)
throw std::runtime_error("Parameter count for --connected-component-depth has to be 1 or the number of cascaded clustering rounds.");
if (config.connected_component_depth.size() == 0)
return 0;
if (config.connected_component_depth.size() == 1)
return round_ccd(0);
return round_ccd(round);
}

}
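The two `round_ccd` overloads added in helpers.cpp resolve `--connected-component-depth` to a per-round integer: the option accepts either a single value applied to every cascaded clustering round, or exactly one value per round. A self-contained sketch of that resolution, with the config vector passed in explicitly instead of read from the global `config` object:

```cpp
#include <cassert>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Sketch of the round_ccd logic in src/cluster/cascaded/helpers.cpp:
// depths holds the raw strings from --connected-component-depth.
int round_ccd(const std::vector<std::string>& depths, int round, int round_count) {
    if (depths.size() > 1 && depths.size() != (size_t)round_count)
        throw std::runtime_error("Parameter count for --connected-component-depth "
            "has to be 1 or the number of cascaded clustering rounds.");
    if (depths.empty())
        return 0; // option not given: depth 0 for all rounds
    const std::string& s = depths.size() == 1 ? depths[0] : depths[round];
    std::stringstream ss(s);
    int i;
    ss >> i;
    if (ss.fail() || !ss.eof()) // reject trailing garbage like "2x"
        throw std::runtime_error("Invalid number format for --connected-component-depth");
    return i;
}
```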
65 changes: 16 additions & 49 deletions src/cluster/cascaded/recluster.cpp
@@ -50,12 +50,16 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
BitVector centroid_aligned(db->sequence_count());
for_each(centroids.begin(), centroids.end(), [&centroid_aligned](OId c) { centroid_aligned.set(c); });
function<void(const HspContext&)> callback([&centroid_aligned](const HspContext& h) {
if (h.scovhsp() >= config.member_cover && (h.approx_id() >= config.approx_min_id || h.id_percent() >= config.approx_min_id))
if (((config.mutual_cover.present() && h.qcovhsp() >= config.mutual_cover.get_present() && h.scovhsp() >= config.mutual_cover.get_present())
|| (!config.mutual_cover.present() && h.scovhsp() >= config.member_cover))
&& (h.approx_id() >= config.approx_min_id || h.id_percent() >= config.approx_min_id))
centroid_aligned.set(h.subject_oid);
});
timer.finish();

HspValues hsp_values = HspValues::TARGET_COORDS;
if (config.mutual_cover.present())
hsp_values |= HspValues::QUERY_COORDS;
if (config.approx_min_id > 0)
hsp_values |= HspValues::QUERY_COORDS | HspValues::IDENT | HspValues::LENGTH;
realign(clusters, centroids, *db, callback, hsp_values);
@@ -80,9 +84,14 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
config.iterate = vector<string>();
config.output_format = { "edge" };
config.self = false;
config.query_cover = config.recluster_bd ? 0 : config.member_cover;
config.subject_cover = 0;
config.query_or_target_cover = config.recluster_bd ? config.member_cover : 0;
if (config.mutual_cover.present()) {
config.query_cover = config.subject_cover = config.mutual_cover.get_present();
}
else {
config.query_cover = config.member_cover;
config.subject_cover = 0;
}
config.query_or_target_cover = 0;
config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id, false).back());
//tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), Search::iterated_sens.at(config.sensitivity).front(), false);
config.lowmem_ = 1;
@@ -106,50 +115,8 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
return out;

shared_ptr<SequenceFile> unmapped;
if (!config.recluster_bd) {
timer.go("Creating database of unmapped sequences");
unmapped.reset(unaligned->sub_db(unmapped_members.cbegin(), unmapped_members.cend()));
}
else {
timer.go("Loading covered centroids list");
vector<pair<OId, OId>> covered_centroids = mapback->targets_covered();
timer.finish();
message_stream << "#Centroid sequences tentatively covered: " << covered_centroids.size() << endl;
timer.go("Filtering list");
ips4o::parallel::sort(covered_centroids.begin(), covered_centroids.end());
vector<OId> centroid_list;
auto it = covered_centroids.begin();
auto it2 = unmapped_members.begin();
while (it < covered_centroids.end() && it2 < unmapped_members.end()) {
if (it->first < *it2)
++it;
else if (*it2 < it->first)
++it2;
else {
centroid_list.push_back(it->second);
++it;
}
}
ips4o::parallel::sort(centroid_list.begin(), centroid_list.end());
auto end = std::unique(centroid_list.begin(), centroid_list.end());
const int64_t n = end - centroid_list.begin();
timer.finish();
message_stream << "#Centroid sequences covered: " << n << endl;
timer.go("Making sequence list for reclustering");
for (int64_t i = 0; i < (int64_t)unmapped_members.size(); ++i)
unmapped_members[i] = unal_members[unmapped_members[i]];
const vector<OId> members = cluster_members(centroid_list.begin(), end, clusters);
unmapped_members.reserve(unmapped_members.size() + members.size());
unmapped_members.insert(unmapped_members.end(), members.begin(), members.end());
//unmapped_members.reserve(unmapped_members.size() + n);
//for (auto i = centroid_list.begin(); i < end; ++i)
//unmapped_members.push_back(centroids[*i]);
ips4o::parallel::sort(unmapped_members.begin(), unmapped_members.end());
timer.finish();
message_stream << "#Sequences from covered clusters: " << members.size() << endl;

timer.go("Creating database for reclustering");
unmapped.reset(db->sub_db(unmapped_members.cbegin(), unmapped_members.cend()));
}
timer.go("Creating database of unmapped sequences");
unmapped.reset(unaligned->sub_db(unmapped_members.cbegin(), unmapped_members.cend()));
mapback.reset();
timer.finish();

@@ -176,7 +143,7 @@ void recluster() {
config.clustering.require();
init_thresholds();
//config.strict_gvc = true;
message_stream << "Coverage cutoff: " << config.member_cover << '%' << endl;
message_stream << "Coverage cutoff: " << (config.mutual_cover.present() ? config.mutual_cover.get_present() : config.member_cover) << '%' << endl;

TaskTimer timer("Opening the database");
shared_ptr<SequenceFile> db(SequenceFile::auto_create({ config.database }, SequenceFile::Flags::NEED_LETTER_COUNT | SequenceFile::Flags::ACC_TO_OID_MAPPING | SequenceFile::Flags::OID_TO_ACC_MAPPING, SequenceFile::Metadata()));
15 changes: 10 additions & 5 deletions src/cluster/cascaded/wrapper.cpp
@@ -1,6 +1,6 @@
/****
DIAMOND protein aligner
Copyright (C) 2019-2023 Max Planck Society for the Advancement of Science e.V.
Copyright (C) 2019-2024 Max Planck Society for the Advancement of Science e.V.
Code developed by Benjamin Buchfink <buchfink@gmail.com>
@@ -66,7 +66,7 @@ struct Config {
double block_size;
std::unique_ptr<OutputFormat> output_format;
std::shared_ptr<FastaFile> centroids;
TaskTimer total_time;
TaskTimer total_time;
int64_t seqs_processed;
int64_t letters_processed;
std::vector<OId> centroid2oid;
@@ -116,8 +116,13 @@ static vector<SuperBlockId> search_vs_centroids(shared_ptr<FastaFile>& super_blo
config.max_target_seqs_ = 1;
config.toppercent = 100;
config.sensitivity = cfg.sens;
config.query_cover = config.member_cover;
config.subject_cover = 0;
if (config.mutual_cover.present()) {
config.query_cover = config.subject_cover = config.mutual_cover.get_present();
}
else {
config.query_cover = config.member_cover;
config.subject_cover = 0;
}
config.query_or_target_cover = 0;
if (cfg.linclust) {
config.iterate.unset();
@@ -159,7 +164,7 @@ void Cascaded::run() {
config.hamming_ext = config.approx_min_id >= 50.0;
TaskTimer total_time;
TaskTimer timer("Opening the input file");
shared_ptr<SequenceFile> db(SequenceFile::auto_create({ config.database }, SequenceFile::Flags::NEED_LETTER_COUNT | SequenceFile::Flags::OID_TO_ACC_MAPPING));
shared_ptr<SequenceFile> db(SequenceFile::auto_create({ config.database }, SequenceFile::Flags::NEED_LETTER_COUNT | SequenceFile::Flags::OID_TO_ACC_MAPPING | SequenceFile::Flags::NEED_LENGTH_LOOKUP));
if (db->type() == SequenceFile::Type::BLAST)
throw std::runtime_error("Clustering is not supported for BLAST databases.");
timer.finish();
31 changes: 8 additions & 23 deletions src/cluster/cluster.h
@@ -37,6 +37,8 @@ class ClusteringAlgorithm {

namespace Cluster {

const double DEFAULT_MEMBER_COVER = 80.0;

struct CentroidSorted {};

void realign(const FlatArray<OId>& clusters, const std::vector<OId>& centroids, SequenceFile& db, std::function<void(const HspContext&)>& callback, HspValues hsp_values);
@@ -60,6 +62,8 @@ std::vector<SuperBlockId> member_counts(const std::vector<SuperBlockId>& mapping
Util::Tsv::File* open_out_tsv();
void init_thresholds();
std::vector<BlockId> len_sorted_clust(const FlatArray<Util::Algo::Edge<SuperBlockId>>& edges);
void output_edges(const std::string& file, SequenceFile& db, const std::vector<Util::Algo::Edge<SuperBlockId>>& edges);
double round_value(const std::vector<std::string>& par, const std::string& name, int round, int round_count);

template<typename Int, typename Int2>
std::vector<Int2> convert_mapping(const std::vector<Int>& mapping, Int2) {
@@ -71,29 +75,20 @@ std::vector<Int2> convert_mapping(const std::vector<Int>& mapping, Int2) {

struct Mapback : public Consumer {
Mapback(int64_t count) :
centroid_id(count, -1),
count(0)
centroid_id(count, -1)
{}
virtual void consume(const char* ptr, size_t n) {
const char* end = ptr + n;
const bool mutual_cov = config.mutual_cover.present();
const double cutoff = mutual_cov ? config.mutual_cover.get_present() : config.member_cover;
OId query = -1;
for(const char* p = ptr; p < end; p += sizeof(Output::Format::Edge::Data)) {
const auto edge = *(Output::Format::Edge::Data*)p;
assert(query == -1 || query == edge.query);
query = edge.query;
if (edge.qcovhsp >= config.member_cover)
if (edge.qcovhsp >= cutoff && ((mutual_cov && edge.scovhsp >= cutoff) || !mutual_cov))
centroid_id[edge.query] = edge.target;
}
if (query >= 0 && centroid_id[query] == -1) {
for (const char* p = ptr; p < end; p += sizeof(Output::Format::Edge::Data)) {
const auto edge = *(Output::Format::Edge::Data*)p;
if (edge.scovhsp >= config.member_cover) {
covered_centroids.write((OId)query);
covered_centroids.write((OId)edge.target);
++count;
}
}
}
}
std::vector<OId> unmapped() const {
std::vector<OId> v;
@@ -102,17 +97,7 @@ struct Mapback : public Consumer {
v.push_back(i);
return v;
}
std::vector<std::pair<OId, OId>> targets_covered() {
std::vector<std::pair<OId, OId>> v;
v.resize(count);
InputFile f(covered_centroids);
f.read(v.data(), count);
f.close_and_delete();
return v;
}
std::vector<OId> centroid_id;
TempFile covered_centroids;
int64_t count;
};
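The rewritten `Mapback::consume` accepts an edge only if it passes the coverage cutoff, which now depends on whether `--mutual-cover` is set. The predicate in isolation, as a sketch (the `accept` helper is hypothetical; in the hunk above the same test is inlined on `edge.qcovhsp`/`edge.scovhsp`):

```cpp
#include <cassert>

// With --mutual-cover, both the query (member) and target (centroid)
// coverage must reach the cutoff; without it, only the query coverage
// is checked against --member-cover.
bool accept(double qcov, double scov, double cutoff, bool mutual) {
    return qcov >= cutoff && (!mutual || scov >= cutoff);
}
```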

template<typename It, typename It2>
4 changes: 2 additions & 2 deletions src/cluster/cluster_registry.h
@@ -23,7 +23,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../contrib/mcl/mcl.h"
#endif
#include "cascaded/cascaded.h"
#include "incremental/incremental.h"
//#include "incremental/incremental.h"

namespace Workflow { namespace Cluster{
class ClusterRegistryStatic{
@@ -35,7 +35,7 @@ class ClusterRegistryStatic{
regMap[MCL::get_key()] = new MCL();
#endif
regMap[::Cluster::Cascaded::get_key()] = new ::Cluster::Cascaded();
regMap[::Cluster::Incremental::Algo::get_key()] = new ::Cluster::Incremental::Algo();
//regMap[::Cluster::Incremental::Algo::get_key()] = new ::Cluster::Incremental::Algo();
}
~ClusterRegistryStatic(){
for(auto it = regMap.begin(); it != regMap.end(); it++){
37 changes: 36 additions & 1 deletion src/cluster/helpers.cpp
@@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include <fstream>
#include <sstream>
#include "cluster.h"
#include "../util/tsv/tsv.h"
#include "../util/string/tokenizer.h"
@@ -41,6 +42,7 @@ using std::floor;
using std::unique_ptr;
using std::back_inserter;
using std::less;
using std::stringstream;
using namespace Util::Tsv;

namespace Cluster {
@@ -113,6 +115,7 @@ vector<Int> member2centroid_mapping(const FlatArray<Int>& clusters, const vector
}
return v;
}
template vector<int64_t> member2centroid_mapping(const FlatArray<int64_t>&, const vector<int64_t>&);

template<typename Int>
pair<FlatArray<Int>, vector<Int>> cluster_sorted(const vector<Int>& mapping) {
@@ -198,13 +201,17 @@ vector<SuperBlockId> member_counts(const vector<SuperBlockId>& mapping) {
}

void init_thresholds() {
if (config.member_cover.present() && config.mutual_cover.present())
throw std::runtime_error("--member-cover and --mutual-cover are mutually exclusive.");
if (!config.mutual_cover.present())
config.member_cover.set_if_blank(DEFAULT_MEMBER_COVER);
if (!config.approx_min_id.present())
config.approx_min_id = config.command == ::Config::DEEPCLUST ? 0.0 : (config.command == ::Config::LINCLUST ? 90.0 : 50.0);
if (config.soft_masking.empty())
config.soft_masking = "tantan";
if (!config.masking_.present())
config.masking_ = "0";
if (config.approx_min_id < 90.0)
if (config.approx_min_id < 90.0 || config.mutual_cover.present())
return;
if(config.command == ::Config::CLUSTER_REASSIGN) {
config.diag_filter_id = 80.0;
@@ -236,4 +243,32 @@ vector<BlockId> len_sorted_clust(const FlatArray<Util::Algo::Edge<SuperBlockId>>
return v;
}

void output_edges(const string& file, SequenceFile& db, const vector<Util::Algo::Edge<SuperBlockId>>& edges) {
File out(Schema{ Type::STRING, Type::STRING }, file, Flags::WRITE);
const Util::Tsv::Table acc_mapping = db.seqid_file().read(config.threads_);
for (const auto& e : edges) {
out.write_record(acc_mapping[e.node1].get<string>(0), acc_mapping[e.node2].get<string>(0));
}
}

double round_value(const vector<string>& par, const string& name, int round, int round_count) {
if (par.empty())
return 0.0;
if (round >= round_count - 1)
return 0.0;
if ((int64_t)par.size() >= round_count)
throw std::runtime_error("Too many values provided for " + name);
vector<double> v;
for(const string& s : par) {
stringstream ss(s);
double i;
ss >> i;
if (ss.fail() || !ss.eof())
throw std::runtime_error("Invalid value provided for " + name + ": " + s);
v.push_back(i);
}
v.insert(v.begin(), round_count - 1 - v.size(), v.front());
return v[round];
}

}
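`round_value` above maps a user-supplied list of per-round thresholds onto `round_count` rounds: the final round always gets 0, and a list shorter than `round_count - 1` is front-padded with its first entry. A standalone model of that padding, taking parsed doubles directly instead of strings (the parsing and error messages are omitted here):

```cpp
#include <cassert>
#include <stdexcept>
#include <vector>

// Sketch of round_value in src/cluster/helpers.cpp, operating on
// already-parsed values.
double round_value(std::vector<double> v, int round, int round_count) {
    if (v.empty() || round >= round_count - 1)
        return 0.0; // no values given, or final round: threshold 0
    if ((int)v.size() >= round_count)
        throw std::runtime_error("Too many values provided");
    // Front-pad with the first value so v covers rounds 0..round_count-2.
    const double first = v.front();
    v.insert(v.begin(), (size_t)(round_count - 1) - v.size(), first);
    return v[round];
}
```

For example, with three cascaded rounds, `{87.0, 85.0}` is used as given, while a single `{85.0}` becomes `{85.0, 85.0}` for the first two rounds.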
36 changes: 0 additions & 36 deletions src/cluster/incremental/common.h

This file was deleted.

65 changes: 0 additions & 65 deletions src/cluster/incremental/config.cpp

This file was deleted.

16 changes: 0 additions & 16 deletions src/cluster/incremental/incremental.h

This file was deleted.

231 changes: 0 additions & 231 deletions src/cluster/incremental/run.cpp

This file was deleted.

2 changes: 1 addition & 1 deletion src/cluster/realign.cpp
@@ -73,7 +73,7 @@ void realign() {

TextBuffer buf;
function<void(const HspContext&)> format_output([&buf, &out, &output_format](const HspContext& h) {
Output::Info info{ SeqInfo(), false, nullptr, buf, {} };
Output::Info info{ SeqInfo(), false, nullptr, buf, {}, AccessionParsing() };
info.query.title = h.query_title.c_str();
output_format->print_match(h, info);
out.write(buf.data(), buf.size());
11 changes: 8 additions & 3 deletions src/cluster/reassign.cpp
@@ -39,7 +39,8 @@ namespace Cluster {
void reassign() {
config.database.require();
config.clustering.require();
message_stream << "Coverage cutoff: " << config.member_cover << '%' << endl;
init_thresholds();
message_stream << "Coverage cutoff: " << (config.mutual_cover.present() ? config.mutual_cover.get_present() : config.member_cover) << '%' << endl;

TaskTimer timer("Opening the database");
shared_ptr<SequenceFile> db(SequenceFile::auto_create({ config.database }, SequenceFile::Flags::NEED_LETTER_COUNT | SequenceFile::Flags::ACC_TO_OID_MAPPING | SequenceFile::Flags::OID_TO_ACC_MAPPING, SequenceFile::Metadata()));
@@ -65,12 +66,16 @@ void reassign() {
timer.finish();

statistics.reset();
init_thresholds();
config.command = Config::blastp;
config.max_target_seqs_ = 1;
config.output_format = { "edge" };
config.self = false;
config.query_cover = config.member_cover;
if (config.mutual_cover.present())
config.query_cover = config.subject_cover = config.mutual_cover.get_present();
else {
config.query_cover = config.member_cover;
config.subject_cover = 0;
}
config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id, false).back());
shared_ptr<Mapback> mapback = make_shared<Mapback>(members.size());
Search::run(centroid_db, member_db, mapback);
2 changes: 1 addition & 1 deletion src/data/blastdb/blastdb.cpp
@@ -295,7 +295,7 @@ int BlastDB::build_version()
return 0;
}

void BlastDB::create_partition_balanced(size_t max_letters)
void BlastDB::create_partition_balanced(int64_t max_letters)
{
throw OperationNotSupported();
}
2 changes: 1 addition & 1 deletion src/data/blastdb/blastdb.h
@@ -36,7 +36,7 @@ struct BlastDB : public SequenceFile {
virtual Metadata metadata() const override;
virtual std::string taxon_scientific_name(TaxId taxid) const override;
virtual int build_version() override;
virtual void create_partition_balanced(size_t max_letters) override;
virtual void create_partition_balanced(int64_t max_letters) override;
virtual void save_partition(const std::string& partition_file_name, const std::string& annotation = "") override;
virtual int get_n_partition_chunks() override;
virtual void set_seqinfo_ptr(OId i) override;
2 changes: 1 addition & 1 deletion src/data/block/block_wrapper.cpp
@@ -77,7 +77,7 @@ bool BlockWrapper::read_seq(vector<Letter>& seq, string& id, std::vector<char>*
throw OperationNotSupported();
}

void BlockWrapper::create_partition_balanced(size_t max_letters) {
void BlockWrapper::create_partition_balanced(int64_t max_letters) {
throw OperationNotSupported();
}

2 changes: 1 addition & 1 deletion src/data/block/block_wrapper.h
@@ -8,7 +8,7 @@ struct BlockWrapper : public SequenceFile
BlockWrapper(const Block& block, Metadata metadata = Metadata(), Flags flags = Flags::NONE, const ValueTraits& value_traits = amino_acid_traits);

virtual int64_t file_count() const override;
virtual void create_partition_balanced(size_t max_letters) override;
virtual void create_partition_balanced(int64_t max_letters) override;
virtual void save_partition(const std::string& partition_file_name, const std::string& annotation = "") override;
virtual int get_n_partition_chunks() override;
virtual void close() override;
12 changes: 9 additions & 3 deletions src/data/dmnd/dmnd.cpp
@@ -167,7 +167,7 @@ DatabaseFile::DatabaseFile(const string &input_file, Metadata metadata, Flags fl
if (flag_any(metadata, Metadata::TAXON_NODES))
taxon_nodes_.reset(new TaxonomyNodes(seek(header2.taxon_nodes_offset), ref_header.build));

if (flag_any(flags, Flags::ACC_TO_OID_MAPPING | Flags::OID_TO_ACC_MAPPING))
if (flag_any(flags, Flags::ACC_TO_OID_MAPPING | Flags::OID_TO_ACC_MAPPING | Flags::NEED_LENGTH_LOOKUP))
read_seqid_list();
}

@@ -421,7 +421,7 @@ void DatabaseFile::create_partition_fixednumber(size_t n) {
this->create_partition(max_letters_balanced);
}

void DatabaseFile::create_partition_balanced(size_t max_letters) {
void DatabaseFile::create_partition_balanced(int64_t max_letters) {
//double n = std::ceil(static_cast<double>(ref_header.letters) / static_cast<double>(max_letters));
//size_t max_letters_balanced = static_cast<size_t>(std::ceil(static_cast<double>(ref_header.letters)/n));
//cout << "Balanced partitioning using " << max_letters_balanced << " (" << max_letters << ")" << endl;
@@ -617,7 +617,9 @@ void DatabaseFile::seq_data(size_t oid, std::vector<Letter>& dst) const

size_t DatabaseFile::seq_length(size_t oid) const
{
throw OperationNotSupported();
if (oid < seq_length_.size())
return seq_length_[oid];
throw std::out_of_range("DatabaseFile::seq_length");
}

void DatabaseFile::init_random_access(const size_t query_block, const size_t ref_blocks, bool dictionary)
@@ -664,6 +666,8 @@ void DatabaseFile::prep_db() {
void DatabaseFile::read_seqid_list() {
if (flag_any(flags_, Flags::ACC_TO_OID_MAPPING))
acc2oid_.reserve(sequence_count());
if (flag_any(flags_, Flags::NEED_LENGTH_LOOKUP))
seq_length_.reserve(sequence_count());
OId oid = 0;
vector<Letter> seq;
string id;
@@ -674,6 +678,8 @@ void DatabaseFile::read_seqid_list() {
if (msg)
message_stream << "Warning: " << msg << std::endl;
add_seqid_mapping(id, oid++);
if (flag_any(flags_, Flags::NEED_LENGTH_LOOKUP))
seq_length_.push_back(seq.size());
}
/*if ((flag_any(flags_, Flags::ACC_TO_OID_MAPPING) && (int64_t)acc2oid_.size() != sequence_count())
|| (flag_any(flags_, Flags::OID_TO_ACC_MAPPING) && (int64_t)acc_.size() != sequence_count()))
2 changes: 1 addition & 1 deletion src/data/dmnd/dmnd.h
@@ -86,7 +86,7 @@ struct DatabaseFile : public SequenceFile, public InputFile
static bool is_diamond_db(const std::string &file_name);

void create_partition(size_t max_letters);
virtual void create_partition_balanced(size_t max_letters) override;
virtual void create_partition_balanced(int64_t max_letters) override;
void create_partition_fixednumber(size_t n);

virtual void save_partition(const std::string& partition_file_name, const std::string& annotation = "") override;
3 changes: 2 additions & 1 deletion src/data/enum_seeds.h
@@ -83,11 +83,12 @@ void enum_seeds_hashed(SequenceSet* seqs, F* f, unsigned begin, unsigned end, co
HashedSeedIterator<BITS> it(seqs->ptr(i), seqs->length(i), sh);
while (it.good()) {
const uint64_t key = *it;
if (filter->contains(key, shape_id))
if (filter->contains(key, shape_id)) {
if (!cfg.filter_low_complexity_seeds || Search::seed_is_complex(it.seq_ptr(sh), sh, cfg.seed_cut))
(*f)(key, seqs->position(i, it.seq_ptr(sh) - seq.data()), i, shape_id);
else if (cfg.mask_low_complexity_seeds)
*it.seq_ptr(sh) |= SEED_MASK;
}
++it;
}
}
12 changes: 9 additions & 3 deletions src/data/fasta/fasta_file.cpp
@@ -67,8 +67,8 @@ FastaFile::FastaFile(const std::vector<std::string>& file_name, Metadata metadat
#endif
}

FastaFile::FastaFile(const string& file_name, bool overwrite, const WriteAccess&):
SequenceFile(SequenceFile::Type::FASTA, Alphabet::STD, Flags::NONE, FormatFlags::DICT_LENGTHS | FormatFlags::DICT_SEQIDS, amino_acid_traits),
FastaFile::FastaFile(const string& file_name, bool overwrite, const WriteAccess&, Flags flags, const ValueTraits& value_traits):
SequenceFile(SequenceFile::Type::FASTA, Alphabet::STD, flags, FormatFlags::DICT_LENGTHS | FormatFlags::DICT_SEQIDS, value_traits),
out_file_(file_name.empty() ? new TempFile : new OutputFile(file_name, Compressor::NONE, overwrite ? "w+b" : "r+b")),
format_(new FASTA_format()),
oid_(0),
@@ -157,7 +157,7 @@ bool FastaFile::read_seq(vector<Letter>& seq, string &id, std::vector<char>* qua
return r;
}

void FastaFile::create_partition_balanced(size_t max_letters) {
void FastaFile::create_partition_balanced(int64_t max_letters) {
throw OperationNotSupported();
}

@@ -265,6 +265,8 @@ void FastaFile::seq_data(size_t oid, std::vector<Letter>& dst) const

size_t FastaFile::seq_length(size_t oid) const
{
if (oid < seq_length_.size())
return seq_length_[oid];
throw OperationNotSupported();
}

@@ -289,6 +291,8 @@ void FastaFile::write_seq(const Sequence& seq, const std::string& id) {
Util::Seq::format(seq, id.c_str(), nullptr, *out_file_, "fasta", value_traits_);
++seqs_;
letters_ += seq.length();
if (flag_any(flags_, Flags::NEED_LENGTH_LOOKUP))
seq_length_.push_back(seq.length());
}

void FastaFile::prep_db(const string& path) {
@@ -321,6 +325,8 @@ std::pair<int64_t, int64_t> FastaFile::init_read() {
while (read_seq(seq, id)) {
if (flag_any(flags_, Flags::ACC_TO_OID_MAPPING | Flags::OID_TO_ACC_MAPPING))
add_seqid_mapping(id, seqs);
if (flag_any(flags_, Flags::NEED_LENGTH_LOOKUP))
seq_length_.push_back(seq.size());
++seqs;
letters += seq.size();
}
4 changes: 2 additions & 2 deletions src/data/fasta/fasta_file.h
@@ -8,10 +8,10 @@ struct FastaFile : public SequenceFile
struct WriteAccess {};

FastaFile(const std::vector<std::string> &file_name, Metadata metadata = Metadata(), Flags flags = Flags::NONE, const ValueTraits& value_traits = amino_acid_traits);
FastaFile(const std::string& file_name, bool overwrite, const WriteAccess&);
FastaFile(const std::string& file_name, bool overwrite, const WriteAccess&, Flags flags = Flags::NONE, const ValueTraits& value_traits = amino_acid_traits);

virtual int64_t file_count() const override;
virtual void create_partition_balanced(size_t max_letters) override;
virtual void create_partition_balanced(int64_t max_letters) override;
virtual void save_partition(const std::string& partition_file_name, const std::string& annotation = "") override;
virtual int get_n_partition_chunks() override;
virtual void close() override;
24 changes: 15 additions & 9 deletions src/data/flags.h
@@ -16,12 +16,12 @@ struct NoFilter

extern NoFilter no_filter;

#ifdef KEEP_TARGET_ID
struct SeedLoc {
SeedLoc() {}
SeedLoc(PackedLoc pos) :
#pragma pack(1)
struct PackedLocId {
PackedLocId() {}
PackedLocId(PackedLoc pos) :
pos(pos) {}
SeedLoc(PackedLoc pos, uint32_t block_id) :
PackedLocId(PackedLoc pos, uint32_t block_id) :
pos(pos),
block_id(block_id)
{}
@@ -30,10 +30,16 @@ struct SeedLoc {
}
PackedLoc pos;
uint32_t block_id;
};
#else
using SeedLoc = PackedLoc;
#endif
} PACKED_ATTRIBUTE;
#pragma pack()

static inline uint32_t block_id(PackedLocId i) {
return i.block_id;
}

static inline uint32_t block_id(PackedLoc i) {
throw std::runtime_error("Unsupported");
}

struct EnumCfg {
const std::vector<size_t>* partition;
20 changes: 13 additions & 7 deletions src/data/frequent_seeds.cpp
@@ -32,9 +32,10 @@ using std::endl;
using std::atomic;
using std::vector;

const double Frequent_seeds::hash_table_factor = 1.3;
Frequent_seeds frequent_seeds;
const double FrequentSeeds::hash_table_factor = 1.3;
FrequentSeeds frequent_seeds;

template<typename SeedLoc>
static void compute_sd(atomic<unsigned> *seedp, DoubleArray<SeedLoc> *query_seed_hits, DoubleArray<SeedLoc> *ref_seed_hits, vector<Sd> *ref_out, vector<Sd> *query_out)
{
int p;
@@ -49,7 +50,8 @@ static void compute_sd(atomic<unsigned> *seedp, DoubleArray<SeedLoc> *query_seed
}
}

void Frequent_seeds::build_worker(
template<typename SeedLoc>
void FrequentSeeds::build_worker(
size_t seedp,
size_t thread_id,
DoubleArray<SeedLoc> *query_seed_hits,
@@ -89,13 +91,14 @@ void Frequent_seeds::build_worker(
(*counts)[seedp] = (unsigned)n;
}

void Frequent_seeds::build(unsigned sid, const SeedPartitionRange &range, DoubleArray<SeedLoc> *query_seed_hits, DoubleArray<SeedLoc> *ref_seed_hits, Search::Config& cfg)
template<typename SeedLoc>
void FrequentSeeds::build(unsigned sid, const SeedPartitionRange &range, DoubleArray<SeedLoc> *query_seed_hits, DoubleArray<SeedLoc> *ref_seed_hits, Search::Config& cfg)
{
vector<Sd> ref_sds(range.size()), query_sds(range.size());
atomic<unsigned> seedp(range.begin());
vector<std::thread> threads;
for (int i = 0; i < config.threads_; ++i)
threads.emplace_back(compute_sd, &seedp, query_seed_hits, ref_seed_hits, &ref_sds, &query_sds);
threads.emplace_back(compute_sd<SeedLoc>, &seedp, query_seed_hits, ref_seed_hits, &ref_sds, &query_sds);
for (auto &t : threads)
t.join();

@@ -105,11 +108,14 @@ void Frequent_seeds::build(unsigned sid, const SeedPartitionRange &range, Double
log_stream << "Seed frequency mean (query) = " << query_sd.mean() << ", SD = " << query_sd.sd() << endl;
log_stream << "Seed frequency cap query: " << query_max_n << ", reference: " << ref_max_n << endl;
vector<unsigned> counts(Const::seedp);
Util::Parallel::scheduled_thread_pool_auto(config.threads_, Const::seedp, build_worker, query_seed_hits, ref_seed_hits, &range, sid, ref_max_n, query_max_n, &counts, &cfg);
Util::Parallel::scheduled_thread_pool_auto(config.threads_, Const::seedp, build_worker<SeedLoc>, query_seed_hits, ref_seed_hits, &range, sid, ref_max_n, query_max_n, &counts, &cfg);
log_stream << "Masked positions = " << std::accumulate(counts.begin(), counts.end(), 0) << std::endl;
}

void Frequent_seeds::clear_masking(SequenceSet& seqs) {
template void FrequentSeeds::build(unsigned, const SeedPartitionRange&, DoubleArray<PackedLoc>*, DoubleArray<PackedLoc>*, Search::Config&);
template void FrequentSeeds::build(unsigned, const SeedPartitionRange&, DoubleArray<PackedLocId>*, DoubleArray<PackedLocId>*, Search::Config&);

void FrequentSeeds::clear_masking(SequenceSet& seqs) {
for (BlockId i = 0; i < seqs.size(); ++i) {
const size_t len = seqs.length(i);
Letter* p = seqs.ptr(i), *end = p + len;
6 changes: 4 additions & 2 deletions src/data/frequent_seeds.h
@@ -29,16 +29,18 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

struct SequenceSet;

struct Frequent_seeds
struct FrequentSeeds
{

template<typename SeedLoc>
void build(unsigned sid, const SeedPartitionRange &range, DoubleArray<SeedLoc> *query_seed_hits, DoubleArray<SeedLoc> *ref_seed_hits, Search::Config& cfg);
static void clear_masking(SequenceSet& seqs);

private:

static const double hash_table_factor;

template<typename SeedLoc>
static void build_worker(
size_t seedp,
size_t thread_id,
@@ -53,4 +55,4 @@ struct Frequent_seeds

};

extern Frequent_seeds frequent_seeds;
extern FrequentSeeds frequent_seeds;
77 changes: 45 additions & 32 deletions src/data/seed_array.cpp
@@ -1,6 +1,6 @@
/****
DIAMOND protein aligner
Copyright (C) 2013-2021 Max Planck Society for the Advancement of Science e.V.
Copyright (C) 2013-2024 Max Planck Society for the Advancement of Science e.V.
Benjamin Buchfink
Eberhard Karls Universitaet Tuebingen
@@ -30,13 +30,17 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
using std::array;
using std::vector;

typedef vector<array<SeedArray::Entry*, Const::seedp>> PtrSet;
template<typename SeedLoc> using PtrSet = vector<array<typename SeedArray<SeedLoc>::Entry*, Const::seedp>>;

char* SeedArray::alloc_buffer(const SeedHistogram &hst, int index_chunks)
template<typename SeedLoc>
char* SeedArray<SeedLoc>::alloc_buffer(const SeedHistogram &hst, int index_chunks)
{
return new char[sizeof(Entry) * hst.max_chunk_size(index_chunks)];
}

template char* SeedArray<PackedLoc>::alloc_buffer(const SeedHistogram&, int);
template char* SeedArray<PackedLocId>::alloc_buffer(const SeedHistogram&, int);

static int seed_bits(const SeedEncoding code) {
switch (code) {
case SeedEncoding::HASHED:
@@ -51,10 +55,11 @@ static int seed_bits(const SeedEncoding code) {
throw std::runtime_error("Unknown seed encoding.");
}

template<typename SeedLoc>
struct BufferedWriter
{
static const unsigned BUFFER_SIZE = 16;
BufferedWriter(SeedArray::Entry* const* ptr)
BufferedWriter(typename SeedArray<SeedLoc>::Entry* const* ptr)
{
memset(n, 0, sizeof(n));
memcpy(this->ptr, ptr, sizeof(this->ptr));
@@ -64,14 +69,14 @@ struct BufferedWriter
const unsigned p = seed_partition(key);
if (range.contains(p)) {
assert(n[p] < BUFFER_SIZE);
buf[p][n[p]++] = SeedArray::Entry(seed_partition_offset(key), value, block_id);
buf[p][n[p]++] = typename SeedArray<SeedLoc>::Entry(seed_partition_offset(key), value, block_id);
if (n[p] == BUFFER_SIZE)
flush(p);
}
}
NO_INLINE void flush(unsigned p)
{
memcpy(ptr[p], buf[p], (size_t)n[p] * sizeof(SeedArray::Entry));
memcpy(ptr[p], buf[p], (size_t)n[p] * sizeof(typename SeedArray<SeedLoc>::Entry));
ptr[p] += n[p];
n[p] = 0;
}
@@ -81,13 +86,14 @@ struct BufferedWriter
if (n[p] > 0)
flush(p);
}
SeedArray::Entry *ptr[Const::seedp], buf[Const::seedp][BUFFER_SIZE];
typename SeedArray<SeedLoc>::Entry *ptr[Const::seedp], buf[Const::seedp][BUFFER_SIZE];
uint8_t n[Const::seedp];
};

PtrSet build_iterators(SeedArray &sa, const ShapeHistogram &hst)
template<typename SeedLoc>
PtrSet<SeedLoc> build_iterators(SeedArray<SeedLoc> &sa, const ShapeHistogram &hst)
{
PtrSet iterators(hst.size());
PtrSet<SeedLoc> iterators(hst.size());
for (unsigned i = 0; i < Const::seedp; ++i)
iterators[0][i] = sa.begin(i);

@@ -97,11 +103,12 @@ PtrSet build_iterators(SeedArray &sa, const ShapeHistogram &hst)
return iterators;
}

template<typename SeedLoc>
struct BuildCallback
{
BuildCallback(const SeedPartitionRange &range, SeedArray::Entry* const* ptr) :
BuildCallback(const SeedPartitionRange &range, typename SeedArray<SeedLoc>::Entry* const* ptr) :
range(range),
it(new BufferedWriter(ptr))
it(new BufferedWriter<SeedLoc>(ptr))
{ }
bool operator()(uint64_t seed, uint64_t pos, uint32_t block_id, size_t shape)
{
@@ -117,11 +124,11 @@ struct BuildCallback
delete it;
}
SeedPartitionRange range;
BufferedWriter *it;
BufferedWriter<SeedLoc> *it;
};

template<typename _filter>
SeedArray::SeedArray(Block &seqs, const ShapeHistogram &hst, const SeedPartitionRange &range, char *buffer, const _filter *filter, const EnumCfg& enum_cfg) :
template<typename SeedLoc> template<typename Filter>
SeedArray<SeedLoc>::SeedArray(Block &seqs, const ShapeHistogram &hst, const SeedPartitionRange &range, char *buffer, const Filter *filter, const EnumCfg& enum_cfg) :
key_bits(seed_bits(enum_cfg.code)),
data_((Entry*)buffer)
{
@@ -131,17 +138,21 @@ SeedArray::SeedArray(Block &seqs, const ShapeHistogram &hst, const SeedPartition
for (int i = range.begin(); i < range.end(); ++i)
begin_[i + 1] = begin_[i] + partition_size(hst, i);

PtrSet iterators(build_iterators(*this, hst));
PtrVector<BuildCallback> cb;
PtrSet<SeedLoc> iterators(build_iterators(*this, hst));
PtrVector<BuildCallback<SeedLoc>> cb;
for (size_t i = 0; i < enum_cfg.partition->size() - 1; ++i)
cb.push_back(new BuildCallback(range, iterators[i].data()));
cb.push_back(new BuildCallback<SeedLoc>(range, iterators[i].data()));
stats_ = enum_seeds(seqs, cb, filter, enum_cfg);
}

template SeedArray::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const NoFilter *, const EnumCfg&);
template SeedArray::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const SeedSet *, const EnumCfg&);
template SeedArray::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const HashedSeedSet *, const EnumCfg&);
template SeedArray<PackedLoc>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const NoFilter *, const EnumCfg&);
template SeedArray<PackedLoc>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const SeedSet *, const EnumCfg&);
template SeedArray<PackedLoc>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange &, char *buffer, const HashedSeedSet *, const EnumCfg&);
template SeedArray<PackedLocId>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange&, char* buffer, const NoFilter*, const EnumCfg&);
template SeedArray<PackedLocId>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange&, char* buffer, const SeedSet*, const EnumCfg&);
template SeedArray<PackedLocId>::SeedArray(Block&, const ShapeHistogram&, const SeedPartitionRange&, char* buffer, const HashedSeedSet*, const EnumCfg&);

template<typename SeedLoc>
struct BufferedWriter2
{
static const unsigned BUFFER_SIZE = 16;
@@ -155,7 +166,7 @@ struct BufferedWriter2
const unsigned p = seed_partition(key);
if (range.contains(p)) {
assert(n[p] < BUFFER_SIZE);
buf[p][n[p]++] = SeedArray::Entry(seed_partition_offset(key), value);
buf[p][n[p]++] = typename SeedArray<SeedLoc>::Entry(seed_partition_offset(key), value);
if (n[p] == BUFFER_SIZE)
flush(p);
}
@@ -171,16 +182,17 @@ struct BufferedWriter2
if (n[p] > 0)
flush(p);
}
vector<Deque<SeedArray::Entry, 15>> out;
SeedArray::Entry buf[Const::seedp][BUFFER_SIZE];
vector<Deque<typename SeedArray<SeedLoc>::Entry, 15>> out;
typename SeedArray<SeedLoc>::Entry buf[Const::seedp][BUFFER_SIZE];
uint8_t n[Const::seedp];
};

template<typename SeedLoc>
struct BuildCallback2
{
BuildCallback2(const SeedPartitionRange& range) :
range(range),
it(new BufferedWriter2())
it(new BufferedWriter2<SeedLoc>())
{ }
bool operator()(uint64_t seed, uint64_t pos, uint32_t block_id, size_t shape)
{
@@ -196,34 +208,35 @@ struct BuildCallback2
delete it;
}
SeedPartitionRange range;
BufferedWriter2* it;
BufferedWriter2<SeedLoc> * it;
};

template<typename _filter>
SeedArray::SeedArray(Block& seqs, const SeedPartitionRange& range, const _filter* filter, EnumCfg& enum_cfg) :
template<typename SeedLoc> template<typename Filter>
SeedArray<SeedLoc>::SeedArray(Block& seqs, const SeedPartitionRange& range, const Filter* filter, EnumCfg& enum_cfg) :
key_bits(seed_bits(enum_cfg.code)),
data_(nullptr)
{
if (enum_cfg.shape_end - enum_cfg.shape_begin > 1)
throw std::runtime_error("SeedArray construction for >1 shape.");
const auto seq_partition = seqs.seqs().partition(config.threads_);
PtrVector<BuildCallback2> cb;
PtrVector<BuildCallback2<SeedLoc>> cb;
for (size_t i = 0; i < seq_partition.size() - 1; ++i)
cb.push_back(new BuildCallback2(range));
cb.push_back(new BuildCallback2<SeedLoc>(range));
enum_cfg.partition = &seq_partition;
stats_ = enum_seeds(seqs, cb, filter, enum_cfg);

array<size_t, Const::seedp> counts;
counts.fill(0);
for (BuildCallback2* p : cb)
for (BuildCallback2<SeedLoc>* p : cb)
for (size_t i = 0; i < Const::seedp; ++i)
counts[i] += p->it->out[i].size();

for (size_t i = 0; i < Const::seedp; ++i) {
entries_[i].reserve(counts[i]);
for (BuildCallback2* p : cb)
for (BuildCallback2<SeedLoc>* p : cb)
p->it->out[i].move(entries_[i]);
}
}

template SeedArray::SeedArray(Block&, const SeedPartitionRange&, const HashedSeedSet*, EnumCfg&);
template SeedArray<PackedLoc>::SeedArray(Block&, const SeedPartitionRange&, const HashedSeedSet*, EnumCfg&);
template SeedArray<PackedLocId>::SeedArray(Block&, const SeedPartitionRange&, const HashedSeedSet*, EnumCfg&);
58 changes: 31 additions & 27 deletions src/data/seed_array.h
Original file line number Diff line number Diff line change
@@ -27,54 +27,58 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

struct Block;

template<typename SeedLoc>
struct SeedArray
{

using Loc = PackedLoc;
static PackedLoc make_seed_loc(PackedLoc pos, uint32_t block_id, PackedLoc) {
return pos;
}

static PackedLocId make_seed_loc(PackedLoc pos, uint32_t block_id, PackedLocId) {
return PackedLocId(pos, block_id);
}

struct Entry
{
Entry() :
key(),
value()
{ }
Entry(SeedOffset key, Loc value) :
Entry(SeedOffset key, PackedLoc value) :
key(key),
value(value)
{ }
Entry(SeedOffset key, Loc pos, uint32_t block_id):
Entry(SeedOffset key, PackedLoc pos, uint32_t block_id) :
key(key),
#ifdef KEEP_TARGET_ID
value(pos, block_id)
#else
value(pos)
#endif
{}
bool operator<(const Entry& entry)const{
return this->key < entry.key;
}
bool operator==(const Entry& entry)const{
return this->key == entry.key && this->value == entry.value;
}
SeedOffset key;

struct GetKey {
uint32_t operator()(const Entry& e) const {
return e.key;
}
};
value(make_seed_loc(pos, block_id, SeedLoc()))
{}
bool operator<(const Entry& entry)const {
return this->key < entry.key;
}
bool operator==(const Entry& entry)const {
return this->key == entry.key && this->value == entry.value;
}
SeedOffset key;

struct GetKey {
uint32_t operator()(const Entry& e) const {
return e.key;
}
};

SeedLoc value;
using Key = decltype(key);
using Value = decltype(value);
using value_type = Entry;
using value_type = Entry;
} PACKED_ATTRIBUTE;

template<typename _filter>
SeedArray(Block &seqs, const ShapeHistogram &hst, const SeedPartitionRange &range, char *buffer, const _filter *filter, const EnumCfg& enum_cfg);

template<typename _filter>
SeedArray(Block& seqs, const SeedPartitionRange& range, const _filter* filter, EnumCfg& cfg);
template<typename Filter>
SeedArray(Block &seqs, const ShapeHistogram &hst, const SeedPartitionRange &range, char *buffer, const Filter* filter, const EnumCfg& enum_cfg);

template<typename Filter>
SeedArray(Block& seqs, const SeedPartitionRange& range, const Filter* filter, EnumCfg& cfg);

Entry* begin(unsigned i)
{
14 changes: 11 additions & 3 deletions src/data/sequence_file.cpp
@@ -247,7 +247,7 @@ size_t SequenceFile::dict_block(const size_t ref_block)
return config.multiprocessing ? ref_block : 0;
}

Block* SequenceFile::load_seqs(const size_t max_letters, const BitVector* filter, LoadFlags flags, const Chunk& chunk)
Block* SequenceFile::load_seqs(const int64_t max_letters, const BitVector* filter, LoadFlags flags, const Chunk& chunk)
{
reopen();

@@ -786,7 +786,7 @@ template void SequenceFile::sub_db<vector<SuperBlockId>::const_iterator>(vector<

template<typename It>
FastaFile* SequenceFile::sub_db(It begin, It end, const string& file_name) {
FastaFile* f = new FastaFile(file_name, true, FastaFile::WriteAccess());
FastaFile* f = new FastaFile(file_name, true, FastaFile::WriteAccess(), Flags::NEED_LENGTH_LOOKUP);
sub_db(begin, end, f);
return f;
}
@@ -900,7 +900,7 @@ vector<tuple<FastaFile*, vector<OId>, File*>> SequenceFile::length_sort(int64_t
files.reserve(block);
for (int i = 0; i < block; ++i) {
//files.emplace_back(new FastaFile("", true, FastaFile::WriteAccess()), vector<OId>(),
files.emplace_back(new FastaFile("", true, FastaFile::WriteAccess()), vector<OId>(),
files.emplace_back(new FastaFile("", true, FastaFile::WriteAccess(), Flags::NEED_LENGTH_LOOKUP), vector<OId>(),
new File(Schema{ ::Type::INT64 }, "", ::Flags::TEMP));
get<0>(files.back())->init_write();
}
@@ -922,4 +922,12 @@ vector<tuple<FastaFile*, vector<OId>, File*>> SequenceFile::length_sort(int64_t
Util::Tsv::File& SequenceFile::seqid_file() {
seqid_file_->rewind();
return *seqid_file_;
}

size_t SequenceFile::letters_filtered(const BitVector& v) const {
size_t n = 0;
for (OId i = 0; i < v.size(); ++i)
if (v.get(i))
n += seq_length(i);
return n;
}
9 changes: 6 additions & 3 deletions src/data/sequence_file.h
@@ -72,7 +72,8 @@ struct SequenceFile {
SELF_ALN_SCORES = 1 << 5,
NEED_LETTER_COUNT = 1 << 6,
ACC_TO_OID_MAPPING = 1 << 7,
OID_TO_ACC_MAPPING = 1 << 8
OID_TO_ACC_MAPPING = 1 << 8,
NEED_LENGTH_LOOKUP = 1 << 9
};

enum class FormatFlags {
@@ -130,13 +131,14 @@ struct SequenceFile {
virtual int64_t sequence_count() const = 0;
virtual int64_t sparse_sequence_count() const = 0;
virtual size_t letters() const = 0;
virtual size_t letters_filtered(const BitVector& v) const;
virtual int db_version() const = 0;
virtual int program_build_version() const = 0;
virtual bool read_seq(std::vector<Letter>& seq, std::string& id, std::vector<char>* quals = nullptr) = 0;
virtual Metadata metadata() const = 0;
virtual std::string taxon_scientific_name(TaxId taxid) const;
virtual int build_version() = 0;
virtual void create_partition_balanced(size_t max_letters) = 0;
virtual void create_partition_balanced(int64_t max_letters) = 0;
virtual void save_partition(const std::string& partition_file_name, const std::string& annotation = "") = 0;
virtual int get_n_partition_chunks() = 0;
virtual void set_seqinfo_ptr(OId i) = 0;
@@ -157,7 +159,7 @@ struct SequenceFile {
virtual ~SequenceFile();

Type type() const { return type_; }
Block* load_seqs(const size_t max_letters, const BitVector* filter = nullptr, LoadFlags flags = LoadFlags(3), const Chunk& chunk = Chunk());
Block* load_seqs(const int64_t max_letters, const BitVector* filter = nullptr, LoadFlags flags = LoadFlags(3), const Chunk& chunk = Chunk());
void get_seq();
Util::Tsv::File* make_seqid_list();
size_t total_blocks() const;
@@ -237,6 +239,7 @@ struct SequenceFile {
std::unique_ptr<TaxonomyNodes> taxon_nodes_;
std::unordered_map<std::string, OId> acc2oid_;
std::unique_ptr<Util::Tsv::File> seqid_file_;
std::vector<Loc> seq_length_;
StringSet acc_;

private:
24 changes: 23 additions & 1 deletion src/data/taxonomy_nodes.cpp
@@ -25,7 +25,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/io/text_input_file.h"
#include "../util/string/tokenizer.h"

using namespace std;
using std::string;
using std::reverse;
using std::vector;
using std::set;
using std::endl;
using std::to_string;
using std::runtime_error;

TaxonomyNodes::TaxonomyNodes(const string& file_name, const bool init_cache)
{
@@ -170,4 +176,20 @@ std::set<TaxId> TaxonomyNodes::rank_taxid(const std::vector<TaxId> &taxid, Rank
for (unsigned i : taxid)
r.insert(rank_taxid(i, rank));
return r;
}

vector<TaxId> TaxonomyNodes::lineage(TaxId taxid) const {
static const int MAX = 64;
vector<TaxId> out;
int n = 0;
while (true) {
if (taxid == 0 || taxid == 1)
break;
if (++n > MAX)
throw std::runtime_error("Path in taxonomy too long (TaxonomyNodes::lineage).");
out.push_back(taxid);
taxid = get_parent(taxid);
}
reverse(out.begin(), out.end());
return out;
}
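TaxonomyNodes::lineage above walks parent links from a taxon up to the root (taxid 1, with 0 treated as undefined), caps the path at 64 steps to guard against cycles, and reverses the result so it reads root-to-leaf. A toy version of the same walk, using a hypothetical parent map in place of get_parent():

```cpp
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <unordered_map>
#include <vector>

using TaxId = int;

// Toy lineage walk mirroring TaxonomyNodes::lineage: follow parents until
// the root (1) or an undefined node (0), cap the path length, then reverse
// so the result is ordered root-to-leaf.
inline std::vector<TaxId> toy_lineage(TaxId taxid,
                                      const std::unordered_map<TaxId, TaxId>& parent) {
    static const int MAX = 64;
    std::vector<TaxId> out;
    int n = 0;
    while (taxid != 0 && taxid != 1) {
        if (++n > MAX)
            throw std::runtime_error("Path in taxonomy too long.");
        out.push_back(taxid);
        auto it = parent.find(taxid);
        taxid = it == parent.end() ? 0 : it->second; // missing parent ends the walk
    }
    std::reverse(out.begin(), out.end());
    return out;
}
```

With a parent table {9606 → 9605, 9605 → 9604, 9604 → 1}, toy_lineage(9606, parent) yields {9604, 9605, 9606}.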
1 change: 1 addition & 0 deletions src/data/taxonomy_nodes.h
@@ -73,6 +73,7 @@ struct TaxonomyNodes
unsigned get_lca(unsigned t1, unsigned t2) const;
bool contained(TaxId query, const std::set<TaxId> &filter);
bool contained(const std::vector<TaxId>& query, const std::set<TaxId> &filter);
std::vector<TaxId> lineage(TaxId taxid) const;

private:

297 changes: 297 additions & 0 deletions src/dna/chain.cpp
@@ -0,0 +1,297 @@
/****
DIAMOND protein aligner
Copyright (C) 2023 Vincent Spath
Code developed by Vincent Spath <vincent.spath@tuebingen.mpg.de>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include "chain.h"
#include "../util/util.h"
#include <cstdint>
#include "../util/math/log2_fast.h"

namespace Dna {


/**
* find the start of a chain
* @param max_drop
* @param score_end
* @param index_end
* @param aD
* @return index of the start of the chain
*/
static int64_t chain_start(const int32_t max_drop, const uint64_t score_end, const uint64_t index_end, const AnchorData &aD) {

int64_t i = index_end;
int64_t max_i = i;
int32_t max_s = 0;
// check for invalid or visited anchor
if (i < 0 || aD.temp_marking[i] != 0) return i;
// iterate over all anchors in chain while anchor has not been visited
do {
// move to the previous anchor in the chain
i = aD.predecessor_anchor[i];
// compute score difference between anchors (if valid anchor)
const int32_t s = i < 0 ? score_end : (int32_t)score_end - aD.best_score_anchor[i];
// new max score? (best extension so far)
if (s > max_s) {
max_s = s;
max_i = i;
}
// chain does not extend if score drops too much between anchors
else if (max_s - s > max_drop) break;
} while (i >= 0 && aD.temp_marking[i] == 0);
return max_i;
}


/**
 * backtrack over predecessor links to recover chains
 * @param aD anchor data filled by the DP pass
 * @param min_chain_score minimum score for a chain to be reported
 * @param max_drop maximum score drop tolerated within a chain
 * @param begin pointer to the first seed match
 * @param reverse whether the matches are on the reverse strand
 * @return vector of chains
 */
std::vector<Chain> chain_backtrack(AnchorData &aD, const int min_chain_score, const int max_drop, const SeedMatch *begin, bool reverse) {

// potential starting positions of valid chains (score, index)
std::vector<std::pair<uint64_t, uint64_t>> z_end;
for (int64_t i = 0; i < (int64_t)aD.best_score_anchor.size(); ++i)
if (aD.best_score_anchor[i] >= min_chain_score) z_end.emplace_back(aD.best_score_anchor[i], i);

// sort by score
std::sort(z_end.begin(), z_end.end());

// size of z_end is the number of potential end positions of valid chains
const int64_t n_z = z_end.size();
if (n_z == 0) return {};

// chains found during the chaining process
std::vector<Chain> chains;
aD.temp_marking.assign(aD.temp_marking.size(), 0);
// iterate over all potential end positions (highest to lowest)
for (int64_t k = n_z - 1; k >= 0; --k) {
// skip end positions already used by another chain
if (aD.temp_marking[z_end[k].second] != 0) continue;
// find the start of the current chain
const int64_t start_i = chain_start(max_drop, z_end[k].first, z_end[k].second, aD);
// iterate over all positions in chain, marks used anchors
Chain chain_t = Chain(reverse);
int64_t i; // declared outside the loop: needed for the score computation below
for (i = z_end[k].second; i != start_i; i = aD.predecessor_anchor[i]) {
aD.temp_marking[i] = 1;
chain_t.anchors.emplace_back(begin[i].i(), begin[i].j());
}
// score of current chain
const int32_t sc = i < 0 ? z_end[k].first : (int32_t)z_end[k].first - aD.best_score_anchor[i];
// keep the chain if its score passes the threshold and it contains anchors
if (sc >= min_chain_score && !chain_t.anchors.empty()){
chain_t.target_id = begin->id();
chain_t.chain_score = sc;
chains.push_back(std::move(chain_t));
}
}
return chains;
}


/**
* compute score between two anchors
* @param hit_i
* @param hit_j
* @param q_span
* @param p
* @return score of the extension
*/
int32_t compute_score(const SeedMatch &hit_i, const SeedMatch &hit_j, int q_span, const ChainingParameters &p) {

// distance on query
const int32_t dq = hit_i.i() - hit_j.i();
if (dq <= 0 || dq > p.MAX_DIST_X) return INT32_MIN;

// distance on target
const int32_t dr = hit_i.j() - hit_j.j();
if (dr == 0 || dr > p.MAX_DIST_Y) return INT32_MIN;

// absolute difference (in positions) between query and target
const int32_t dd = dr > dq? dr - dq : dq - dr;

// too big distance on query or target
if (dd > p.BAND_WIDTH) return INT32_MIN;

// smaller distance on query or target (gap)
const int32_t dg = std::min(dr, dq);

// initial score: smaller q_span or gap
int32_t sc = std::min(q_span, dg);
if (dd || dg > q_span) {
float lin_pen = p.CHAIN_PEN_GAP * (float)dd + p.CHAIN_PEN_SKIP * (float)dg;
float log_pen = dd >= 1? log2_ap(dd + 1) : 0.0f; // log2_ap() only valid for arguments >= 2, i.e. dd >= 1
sc -= (int)(lin_pen + .5f * log_pen);
}

return sc;
}
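compute_score rewards min(q_span, gap) and subtracts a linear-plus-logarithmic penalty driven by the diagonal drift dd and the gap dg. A self-contained sketch of just that arithmetic (anchor_pair_score is a made-up name; std::log2 stands in for DIAMOND's log2_ap approximation, and the distance-limit checks are omitted):

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Anchor-pair score sketch: dq/dr are the query/target distances between
// the two anchors, dd their difference (diagonal drift), dg the smaller of
// the two (gap). The score starts at min(q_span, dg) and is reduced by a
// linear + logarithmic penalty when the anchors drift or the gap exceeds
// the seed span.
inline int32_t anchor_pair_score(int32_t dq, int32_t dr, int q_span,
                                 float pen_gap, float pen_skip) {
    const int32_t dd = dr > dq ? dr - dq : dq - dr;
    const int32_t dg = std::min(dr, dq);
    int32_t sc = std::min(q_span, dg);
    if (dd || dg > q_span) {
        const float lin_pen = pen_gap * (float)dd + pen_skip * (float)dg;
        const float log_pen = dd >= 1 ? std::log2((float)(dd + 1)) : 0.0f;
        sc -= (int)(lin_pen + .5f * log_pen);
    }
    return sc;
}
```

For dq = 20, dr = 23, q_span = 15 and a gap penalty of 1.0: dd = 3, dg = 20, base score 15, penalty (int)(3 + 0.5 * log2(4)) = 4, giving a score of 11.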



/**
* identifies the primary chains and their mapping quality
* @param chains
* @param kmer_size
*/
void compute_primary_chains(std::vector<Chain> &chains, const int kmer_size) { // new version

std::vector<int> score_secondary(chains.size(), 0);
// first chain is always primary
std::vector<size_t> primary_chains = {0};
std::vector<int> chain_span;
chain_span.reserve(chains.size());
// precompute the query span of each chain
for (const Chain &chain : chains) {
chain_span.push_back(chain.anchors[0].i + kmer_size - chain.anchors.back().i);
}

// iterate over all chains
for (size_t i = 1; i < chains.size(); ++i) {
// chain overlaps with a primary?
bool primary = true;
for (size_t c : primary_chains) {
// calculate overlap length between chains
int overlapLength = chains[i].overlap_query(chains[c], kmer_size);
// no overlap
if (overlapLength <= 0) {
continue;
}

// calculate overlap percentage of shorter chain
double overlapPercentage = static_cast<double>(overlapLength) / std::min(chain_span[i], chain_span[c]);

if (overlapPercentage >= MIN_OVERLAP_PERCENTAGE) {
primary = false;
// Chain i is the best secondary to Chain c
score_secondary[c] = std::max(score_secondary[c], chains[i].chain_score);
}
}
// primary chain added to vector
if (primary) {
primary_chains.push_back(i);
}
}

// mapping quality
for (size_t i : primary_chains) {
chains[i].compute_mapping_quality(score_secondary[i]);
}
}


/**
 * dynamic programming chaining algorithm
 * @param window
 * @param kmer_size
 * @param p chaining parameters
 * @param begin pointer to the first seed match
 * @param end pointer past the last seed match
 * @param reverse whether the matches are on the reverse strand
 * @return vector of chains
 */
std::vector<Chain> chain_dp(const int window, const int kmer_size, const ChainingParameters &p, const SeedMatch* begin,
const SeedMatch* end, bool reverse) {

// number of total matches
const auto n = std::distance(begin, end);

const int32_t max_drop = p.BAND_WIDTH; // for clarity and testing
int32_t mmax_f = 0;

// initialize vectors for chaining
AnchorData aD(n);

// operates on a single vector of matches for one query/target pair
int64_t max_i_s = -1;
for (int64_t i = 0; i < n; ++i) {
int64_t max_j = -1;
int32_t max_f = kmer_size;
int32_t n_skip = 0;
// increase st until same id or t_pos of st is in range of i
int64_t st = 0;
while (st < i && begin[i].j() > begin[st].j() + p.MAX_DIST_X) ++st;
// stay in range of max iterations
st = std::max(st, i - p.MAX_ITER);
// iterate over all hits from st to i
int64_t j;
for (j = i - 1; j >= st; --j) {
int32_t sc = compute_score(begin[i], begin[j], kmer_size, p);
if (sc == INT32_MIN) continue;
sc += aD.best_score_anchor[j];
// new max score?
if (sc > max_f) {
max_f = sc;
max_j = j;
// pending skipped seeds?
if (n_skip > 0) --n_skip;
// already in chain?
} else if (aD.temp_marking[j] == (int32_t)i) {
// increment number of skipped seeds and break if too many
if (++n_skip > p.MAX_SKIP) break;
}
// updates the seed information if previous seed was part of the chain
if (aD.predecessor_anchor[j] >= 0) aD.temp_marking[aD.predecessor_anchor[j]] = i;
}
int64_t end_j = j;
if (max_i_s < 0 || begin[i].j() - begin[max_i_s].j() > (int64_t)p.MAX_DIST_X) {
int32_t max = INT32_MIN;
max_i_s = -1;
// find seed with the highest score
for (j = i - 1; j >= st; --j) {
if (max < aD.best_score_anchor[j]){
max = aD.best_score_anchor[j];
max_i_s = j;
}
}
}
// valid max score
if (max_i_s >= 0 && max_i_s < end_j) {
// score of extending the current anchor to the best scoring anchor
const int32_t tmp = compute_score(begin[i], begin[max_i_s], kmer_size, p);
// score is valid and higher than the current max score
if (tmp != INT32_MIN && max_f < tmp + aD.best_score_anchor[max_i_s]){
max_f = tmp + aD.best_score_anchor[max_i_s];
max_j = max_i_s;
}
}
// store the max chain score ending at seed i and the index of the predecessor that yields it
// peak_score_anchor keeps the peak score up to i; best_score_anchor is the score ending at i, not always the peak
aD.best_score_anchor[i] = max_f;
aD.predecessor_anchor[i] = max_j;
aD.peak_score_anchor[i] = max_j >= 0 && aD.peak_score_anchor[max_j] > max_f? aD.peak_score_anchor[max_j] : max_f;
if (max_i_s < 0 || (aD.best_score_anchor[max_i_s] < aD.best_score_anchor[i]))
max_i_s = i;
// maximum score in entire chaining
mmax_f = std::max(mmax_f, max_f);
}

return chain_backtrack(aD, p.MIN_CHAIN_SCORE, max_drop, begin, reverse);

}

// Chain constructor
Chain::Chain(bool rev) : chain_score(0), anchors(), target_id(0), mapping_quality(0), reverse(rev) {}

}
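The DP in chain_dp fills best_score_anchor and predecessor_anchor by extending each anchor from its best-scoring compatible predecessor; chain_backtrack then walks the predecessor links. A toy skeleton of that DP on plain (query, target) positions, with a unit reward per anchor standing in for compute_score and none of the skip/iteration limits:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy anchor-chaining DP: anchors are (query, target) positions sorted by
// target; each anchor extends the best-scoring predecessor that is strictly
// increasing in both coordinates. best[] holds the chain score ending at
// each anchor, pred[] the predecessor index used for backtracking.
struct ToyAnchor { int32_t q, t; };

inline int32_t toy_chain_dp(const std::vector<ToyAnchor>& a) {
    const int64_t n = (int64_t)a.size();
    std::vector<int32_t> best(n, 1);   // every anchor is a chain of length 1
    std::vector<int64_t> pred(n, -1);
    int32_t best_overall = 0;
    for (int64_t i = 0; i < n; ++i) {
        for (int64_t j = i - 1; j >= 0; --j) {
            if (a[j].q < a[i].q && a[j].t < a[i].t && best[j] + 1 > best[i]) {
                best[i] = best[j] + 1;
                pred[i] = j;
            }
        }
        if (best[i] > best_overall) best_overall = best[i];
    }
    return best_overall; // score (length) of the best chain
}
```

On anchors {(1,1), (3,2), (2,3), (4,4)} the best chain has three anchors, e.g. (1,1) → (3,2) → (4,4).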
106 changes: 106 additions & 0 deletions src/dna/chain.h
@@ -0,0 +1,106 @@
/****
DIAMOND protein aligner
Copyright (C) 2023 Vincent Spath
Code developed by Vincent Spath <vincent.spath@tuebingen.mpg.de>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include "seed_set_dna.h"

namespace Dna {

const double MIN_OVERLAP_PERCENTAGE = 0.5;

struct ChainingParameters {
int MAX_DIST_X = 5000;
int MAX_DIST_Y = 5000;
const int BAND_WIDTH = 500;
const int MAX_SKIP = 25;
const int MAX_ITER = 5000;
const int MIN_CHAIN_SCORE = 40;
const int MIN_NUMBER_MINIMIZERS = 3;
const float MAP_PERCENTAGE = 0.5;

const float CHAIN_PEN_GAP;
const float CHAIN_PEN_SKIP;

// constructor for additional parameters
ChainingParameters(float gap, float skip)
: CHAIN_PEN_GAP(gap), CHAIN_PEN_SKIP(skip) {}
};

struct AnchorData {
// optimal predecessor for each anchor
std::vector<int64_t> predecessor_anchor;
// best score for each anchor
std::vector<int32_t> best_score_anchor;
// best score up to now (peak)
std::vector<int32_t> peak_score_anchor;

std::vector<int32_t> temp_marking;

explicit AnchorData(int n) :
predecessor_anchor(n),
best_score_anchor(n),
peak_score_anchor(n),
temp_marking(n, 0) {}
};



struct Chain {
explicit Chain(bool rev);
int32_t chain_score;
BlockId target_id;
uint8_t mapping_quality;
bool reverse;

struct Anchor {
Loc i;
Loc j;
Anchor(Loc i, Loc j) : i(i), j(j) {}
};
// query/target starting position in reverse (i, j)
std::vector<Anchor> anchors;

int overlap_query(const Chain &other, const int kmer_size) const {
return (std::min(anchors[0].i, other.anchors[0].i) + kmer_size) -
std::max(anchors.back().i, other.anchors.back().i);

}

void compute_mapping_quality(int score_secondary) {
double sc_ratio = static_cast<double>(score_secondary) / chain_score;
double quality = 40 * (1 - sc_ratio) * std::min(1.0, static_cast<double>(anchors.size()) / 10) * std::log(chain_score);
// compress to 0-60
mapping_quality = static_cast<uint8_t>(quality * 60 / 312);
}

bool operator>(const Chain& other)const
{return this->chain_score > other.chain_score;}

};






std::vector<Chain> chain_dp(int window, int kmer_size, const ChainingParameters &p, const SeedMatch* begin,
const SeedMatch* end, bool reverse);
void compute_primary_chains(std::vector<Chain> &chains, int kmer_size);
}
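compute_mapping_quality above combines three factors: how close the best secondary score comes to the primary score, how many anchors support the chain (capped at 10), and the log of the chain score, then compresses the result to the 0-60 range. The same arithmetic as a free function (chain_mapping_quality is a hypothetical name, not part of the API):

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Standalone mirror of Chain::compute_mapping_quality: quality drops as the
// best secondary chain approaches the primary score, chains with fewer than
// 10 anchors are down-weighted, and the raw value is compressed to 0-60.
inline uint8_t chain_mapping_quality(int chain_score, int score_secondary,
                                     int n_anchors) {
    const double sc_ratio = static_cast<double>(score_secondary) / chain_score;
    const double quality = 40 * (1 - sc_ratio)
        * std::min(1.0, static_cast<double>(n_anchors) / 10)
        * std::log(chain_score);
    return static_cast<uint8_t>(quality * 60 / 312);
}
```

For a primary chain of score 100 with 20 anchors and no secondary, quality = 40 * ln(100) ≈ 184.2, which compresses to 35; if the secondary matches the primary score, the quality is 0.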

0 comments on commit ba08828